zahemen9900 committed
Commit 0f6159c
1 Parent(s): 92c66d1

Refactor inference script to streamline usage; remove unnecessary classes and integrate threading for response generation

Files changed (2)
  1. README.md +1 -130
  2. inference.py +65 -207
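
The "threading for response generation" mentioned in the commit message refers to the usual `transformers` pattern of running `generate()` on a background thread while the main thread consumes a `TextIteratorStreamer`. A minimal, self-contained sketch of that pattern (shown here against the base SmolLM2 model only, without the FinSight LoRA adapter and 4-bit quantization that the script in the diff below sets up) looks like this:

```python
import threading

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Base model only, for illustration; the committed script also applies the LoRA adapter.
model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

inputs = tokenizer("Explain dollar-cost averaging.", return_tensors="pt").to(device)

# The streamer yields decoded text chunks as generate() produces tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread
# while the main thread iterates over the streamer.
thread = threading.Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 128},
)
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```

Both the usage example removed from the README and the new `inference.py` follow this shape, with the quantization config and adapter weights layered on top.
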
README.md CHANGED
@@ -100,137 +100,8 @@ Our evaluation demonstrates significant performance improvements across all stan
 
 ## Usage
 
-### Streaming function
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
-import torch
-from peft import PeftModel
-import threading
-
-# For 4-bit quantized inference (recommended)
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# First load the base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Then load the adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-prompt = "What's your name, and what're you good at?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-# Tokenize the formatted prompt
-inputs = tokenizer(formatted_prompt, return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
-
-# Create a streamer
-streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-
-# Adjust generation parameters for more controlled responses
-generation_config = {
-    "max_new_tokens": 256,
-    "temperature": 0.6,
-    "top_p": 0.95,
-    "do_sample": True,
-    "pad_token_id": tokenizer.eos_token_id,
-    "eos_token_id": tokenizer.eos_token_id,
-    "repetition_penalty": 1.2,
-    "no_repeat_ngram_size": 4,
-    "num_beams": 1,
-    "early_stopping": False,
-    "length_penalty": 1.0,
-}
+Check out the usage functionality in `inference.py`.
 
-# Combine inputs and generation config for the generate function
-generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}
-
-# Start generation in a separate thread
-thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-thread.start()
-
-# Iterate over the generated text
-print("Response: ", end="")
-for text in streamer:
-    print(text, end="", flush=True)
-```
-
-### Simple Non-Streaming Usage
-
-If you prefer a simpler approach without streaming:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import torch
-from peft import PeftModel
-
-# For 4-bit quantized inference
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# Load base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Load adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-# Prepare input
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-user_prompt = "What's a good strategy for long-term investing?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": user_prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-
-# Generate response
-outputs = model.generate(
-    inputs.input_ids,
-    max_new_tokens=256,
-    temperature=0.7,
-    top_p=0.95,
-    do_sample=True,
-    repetition_penalty=1.2
-)
-
-response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print("Response:\n", response.strip())
-```
 
 ## Training Details
 
inference.py CHANGED
@@ -1,212 +1,70 @@
-#!/usr/bin/env python3
-"""
-FinSight AI - Inference script for financial advisory chatbot
-This script provides a simple way to interact with the model via command line
-"""
-
-import os
-import argparse
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import torch
-from typing import List, Dict
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextStreamer,
-    BitsAndBytesConfig
+from peft import PeftModel
+import threading
+
+# For 4-bit quantized inference (recommended)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+# First load the base model with quantization
+base_model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+# Then load the adapter weights (LoRA)
+model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
+prompt = "What's your name, and what're you good at?"
+
+messages = [
+    {"role": "system", "content": system_prompt},
+    {"role": "user", "content": prompt}
+]
+
+formatted_prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
 )
 
-class FinancialAdvisor:
-    def __init__(
-        self,
-        model_id: str = "zahemen9900/finsight-ai",
-        use_4bit: bool = True,
-        device: str = None
-    ):
-        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Using device: {self.device}")
-
-        # Configure quantization if requested and available
-        if use_4bit and self.device == "cuda":
-            print("Loading model in 4-bit quantization mode")
-            bnb_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_compute_dtype=torch.bfloat16
-            )
-        else:
-            print("Loading model in standard mode")
-            bnb_config = None
-
-        # Load tokenizer and model
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config,
-            device_map="auto" if self.device == "cuda" else None,
-            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
-        )
-
-        if self.device == "cpu":
-            self.model = self.model.to(self.device)
-
-        self.model.eval()
-        self.conversation_history = []
-        self.system_message = {
-            "role": "system",
-            "content": (
-                "You are FinSight AI, a helpful and knowledgeable financial assistant. "
-                "You can provide information and guidance on financial topics, market trends, investment strategies, "
-                "and personal finance management. Always strive to be accurate, informative, and helpful. "
-                "Remember that you cannot provide personalized financial advice that would require knowing a person's "
-                "complete financial situation or future market movements."
-            )
-        }
-
-    def generate_response(
-        self,
-        prompt: str,
-        temperature: float = 0.7,
-        max_new_tokens: int = 512,
-        stream: bool = True
-    ) -> str:
-        """Generate response from the model"""
-        # Manage conversation history (keep last 5 exchanges)
-        if len(self.conversation_history) > 10:
-            self.conversation_history = self.conversation_history[-10:]
-
-        # Create messages with history
-        messages = [self.system_message] + self.conversation_history
-        messages.append({"role": "user", "content": prompt})
-
-        # Format prompt using chat template
-        formatted_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        # Encode the input
-        inputs = self.tokenizer(
-            formatted_prompt,
-            return_tensors="pt",
-            truncation=True,
-            max_length=4096
-        ).to(self.device)
-
-        # Setup streamer if requested
-        streamer = TextStreamer(
-            self.tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        ) if stream else None
-
-        # Generate response
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                inputs.input_ids,
-                attention_mask=inputs.attention_mask,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=temperature,
-                top_p=0.95,
-                streamer=streamer,
-                pad_token_id=self.tokenizer.eos_token_id,
-                repetition_penalty=1.1
-            )
-
-        # Return the response
-        if not stream:
-            response = self.tokenizer.decode(
-                output_ids[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
-            print("\nAssistant:", response)
-        else:
-            response = ""  # Response was already streamed
-
-        # Update conversation history
-        self.conversation_history.append({"role": "user", "content": prompt})
-        self.conversation_history.append({"role": "assistant", "content": response if response else "[Response was streamed]"})
-
-        return response
-
-    def start_chat_loop(self):
-        """Start an interactive chat session"""
-        print("\nWelcome to FinSight AI - Your Financial Advisory Assistant!")
-        print("Type 'quit', 'exit', or press Ctrl+C to end the conversation.\n")
-
-        while True:
-            try:
-                user_input = input("\nYou: ").strip()
-                if user_input.lower() in ["quit", "exit", "q"]:
-                    break
-
-                if user_input.lower() == "clear":
-                    self.conversation_history = []
-                    print("Conversation history cleared.")
-                    continue
-
-                print("\nAssistant: ", end="", flush=True)
-                self.generate_response(user_input)
-
-            except KeyboardInterrupt:
-                print("\nExiting chat...")
-                break
-            except Exception as e:
-                print(f"\nError: {e}")
-                continue
-
-        print("\nThank you for using FinSight AI. Goodbye!")
+# Tokenize the formatted prompt
+inputs = tokenizer(formatted_prompt, return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
+
+# Create a streamer
+streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+
+# Adjust generation parameters for more controlled responses
+generation_config = {
+    "max_new_tokens": 256,
+    "temperature": 0.6,
+    "top_p": 0.95,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": tokenizer.eos_token_id,
+    "repetition_penalty": 1.2,
+    "no_repeat_ngram_size": 4,
+    "num_beams": 1,
+    "early_stopping": False,
+    "length_penalty": 1.0,
+}
+
+# Combine inputs and generation config for the generate function
+generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}
 
-def main():
-    parser = argparse.ArgumentParser(description="FinSight AI Inference Script")
-    parser.add_argument(
-        "--model_id",
-        type=str,
-        default="zahemen9900/finsight-ai",
-        help="Model ID or path to load"
-    )
-    parser.add_argument(
-        "--no_quantize",
-        action="store_true",
-        help="Disable 4-bit quantization (uses more memory)"
-    )
-    parser.add_argument(
-        "--query",
-        type=str,
-        help="Single query mode: provide a question and get one response"
-    )
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=0.7,
-        help="Sampling temperature (higher = more random)"
-    )
-    parser.add_argument(
-        "--max_tokens",
-        type=int,
-        default=512,
-        help="Maximum number of new tokens to generate"
-    )
-
-    args = parser.parse_args()
-
-    advisor = FinancialAdvisor(
-        model_id=args.model_id,
-        use_4bit=not args.no_quantize
-    )
-
-    # Single query mode
-    if args.query:
-        advisor.generate_response(
-            args.query,
-            temperature=args.temperature,
-            max_new_tokens=args.max_tokens
-        )
-    # Interactive chat mode
-    else:
-        advisor.start_chat_loop()
+# Start generation in a separate thread
+thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()
 
-if __name__ == "__main__":
-    main()
+# Iterate over the generated text
+print("Response: ", end="")
+for text in streamer:
+    print(text, end="", flush=True)
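
As committed, the new script streams the generated text to stdout but never joins the worker thread or keeps the assembled string. If the full response is needed afterwards (for logging or a chat history, say), a small variant of the final loop, reusing the `thread` and `streamer` objects defined above, could look like this (a sketch, not part of this commit):

```python
# Variant of the final loop: collect the streamed chunks while printing them,
# then wait for generate() to finish before using the assembled response.
chunks = []
print("Response: ", end="")
for text in streamer:
    print(text, end="", flush=True)
    chunks.append(text)
thread.join()  # ensure generation has completed

response = "".join(chunks)
```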