Commit 0f6159c
Parent(s): 92c66d1

Refactor inference script to streamline usage; remove unnecessary classes and integrate threading for response generation

Files changed:
- README.md +1 -130
- inference.py +65 -207
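
In outline, the refactored `inference.py` streams responses by running `model.generate` on a background thread and reading decoded text from a `TextIteratorStreamer`. Below is a minimal sketch of that pattern, using the base model and adapter IDs that appear in the diff; the prompt string, `max_new_tokens`, and the final `thread.join()` are illustrative choices, not the script's exact values.

```python
import threading

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Load the 4-bit quantized base model, then attach the LoRA adapter.
base = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
    ),
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "zahemen9900/finsight-ai")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Illustrative prompt; the actual script builds the input with the chat template.
inputs = tokenizer("What is dollar-cost averaging?", return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a worker thread while
# the main thread prints text chunks as the streamer yields them.
thread = threading.Thread(
    target=model.generate,
    kwargs={
        "input_ids": inputs["input_ids"],
        "max_new_tokens": 64,
        "streamer": streamer,
    },
)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()  # illustrative: wait for the generation thread to finish
```

The full script in the diff additionally applies the tokenizer's chat template with a system prompt and passes a richer generation config (temperature, repetition penalty, n-gram blocking, and so on).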
README.md
CHANGED
@@ -100,137 +100,8 @@ Our evaluation demonstrates significant performance improvements across all stan

 ## Usage

-
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
-import torch
-from peft import PeftModel
-import threading
-
-# For 4-bit quantized inference (recommended)
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# First load the base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Then load the adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-prompt = "What's your name, and what're you good at?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-# Tokenize the formatted prompt
-inputs = tokenizer(formatted_prompt, return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
-
-# Create a streamer
-streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-
-# Adjust generation parameters for more controlled responses
-generation_config = {
-    "max_new_tokens": 256,
-    "temperature": 0.6,
-    "top_p": 0.95,
-    "do_sample": True,
-    "pad_token_id": tokenizer.eos_token_id,
-    "eos_token_id": tokenizer.eos_token_id,
-    "repetition_penalty": 1.2,
-    "no_repeat_ngram_size": 4,
-    "num_beams": 1,
-    "early_stopping": False,
-    "length_penalty": 1.0,
-}
-
-# Combine inputs and generation config for the generate function
-generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}
-
-# Start generation in a separate thread
-thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-thread.start()
-
-# Iterate over the generated text
-print("Response: ", end="")
-for text in streamer:
-    print(text, end="", flush=True)
-```
-
-### Simple Non-Streaming Usage
-
-If you prefer a simpler approach without streaming:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import torch
-from peft import PeftModel
-
-# For 4-bit quantized inference
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# Load base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Load adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-# Prepare input
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-user_prompt = "What's a good strategy for long-term investing?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": user_prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-
-# Generate response
-outputs = model.generate(
-    inputs.input_ids,
-    max_new_tokens=256,
-    temperature=0.7,
-    top_p=0.95,
-    do_sample=True,
-    repetition_penalty=1.2
-)
-
-response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print("Response:\n", response.strip())
-```
+Check out the usage functionality in `inference.py`.

 ## Training Details

inference.py
CHANGED
@@ -1,212 +1,70 @@
-
-"""
-FinSight AI - Inference script for financial advisory chatbot
-This script provides a simple way to interact with the model via command line
-"""
-
-import os
-import argparse
 import torch
-from
 )

-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config,
-            device_map="auto" if self.device == "cuda" else None,
-            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
-        )
-
-        if self.device == "cpu":
-            self.model = self.model.to(self.device)
-
-        self.model.eval()
-        self.conversation_history = []
-        self.system_message = {
-            "role": "system",
-            "content": (
-                "You are FinSight AI, a helpful and knowledgeable financial assistant. "
-                "You can provide information and guidance on financial topics, market trends, investment strategies, "
-                "and personal finance management. Always strive to be accurate, informative, and helpful. "
-                "Remember that you cannot provide personalized financial advice that would require knowing a person's "
-                "complete financial situation or future market movements."
-            )
-        }
-
-    def generate_response(
-        self,
-        prompt: str,
-        temperature: float = 0.7,
-        max_new_tokens: int = 512,
-        stream: bool = True
-    ) -> str:
-        """Generate response from the model"""
-        # Manage conversation history (keep last 5 exchanges)
-        if len(self.conversation_history) > 10:
-            self.conversation_history = self.conversation_history[-10:]
-
-        # Create messages with history
-        messages = [self.system_message] + self.conversation_history
-        messages.append({"role": "user", "content": prompt})
-
-        # Format prompt using chat template
-        formatted_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        # Encode the input
-        inputs = self.tokenizer(
-            formatted_prompt,
-            return_tensors="pt",
-            truncation=True,
-            max_length=4096
-        ).to(self.device)
-
-        # Setup streamer if requested
-        streamer = TextStreamer(
-            self.tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        ) if stream else None
-
-        # Generate response
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                inputs.input_ids,
-                attention_mask=inputs.attention_mask,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=temperature,
-                top_p=0.95,
-                streamer=streamer,
-                pad_token_id=self.tokenizer.eos_token_id,
-                repetition_penalty=1.1
-            )
-
-        # Return the response
-        if not stream:
-            response = self.tokenizer.decode(
-                output_ids[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
-            print("\nAssistant:", response)
-        else:
-            response = ""  # Response was already streamed
-
-        # Update conversation history
-        self.conversation_history.append({"role": "user", "content": prompt})
-        self.conversation_history.append({"role": "assistant", "content": response if response else "[Response was streamed]"})
-
-        return response
-
-    def start_chat_loop(self):
-        """Start an interactive chat session"""
-        print("\nWelcome to FinSight AI - Your Financial Advisory Assistant!")
-        print("Type 'quit', 'exit', or press Ctrl+C to end the conversation.\n")
-
-        while True:
-            try:
-                user_input = input("\nYou: ").strip()
-                if user_input.lower() in ["quit", "exit", "q"]:
-                    break
-
-                if user_input.lower() == "clear":
-                    self.conversation_history = []
-                    print("Conversation history cleared.")
-                    continue
-
-                print("\nAssistant: ", end="", flush=True)
-                self.generate_response(user_input)
-
-            except KeyboardInterrupt:
-                print("\nExiting chat...")
-                break
-            except Exception as e:
-                print(f"\nError: {e}")
-                continue
-
-        print("\nThank you for using FinSight AI. Goodbye!")

-
-        "--model_id",
-        type=str,
-        default="zahemen9900/finsight-ai",
-        help="Model ID or path to load"
-    )
-    parser.add_argument(
-        "--no_quantize",
-        action="store_true",
-        help="Disable 4-bit quantization (uses more memory)"
-    )
-    parser.add_argument(
-        "--query",
-        type=str,
-        help="Single query mode: provide a question and get one response"
-    )
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=0.7,
-        help="Sampling temperature (higher = more random)"
-    )
-    parser.add_argument(
-        "--max_tokens",
-        type=int,
-        default=512,
-        help="Maximum number of new tokens to generate"
-    )
-
-    args = parser.parse_args()
-
-    advisor = FinancialAdvisor(
-        model_id=args.model_id,
-        use_4bit=not args.no_quantize
-    )
-
-    # Single query mode
-    if args.query:
-        advisor.generate_response(
-            args.query,
-            temperature=args.temperature,
-            max_new_tokens=args.max_tokens
-        )
-    # Interactive chat mode
-    else:
-        advisor.start_chat_loop()

+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import torch
+from peft import PeftModel
+import threading
+
+# For 4-bit quantized inference (recommended)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+# First load the base model with quantization
+base_model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+# Then load the adapter weights (LoRA)
+model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
+prompt = "What's your name, and what're you good at?"
+
+messages = [
+    {"role": "system", "content": system_prompt},
+    {"role": "user", "content": prompt}
+]
+
+formatted_prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
 )

+# Tokenize the formatted prompt
+inputs = tokenizer(formatted_prompt, return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
+
+# Create a streamer
+streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+
+# Adjust generation parameters for more controlled responses
+generation_config = {
+    "max_new_tokens": 256,
+    "temperature": 0.6,
+    "top_p": 0.95,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": tokenizer.eos_token_id,
+    "repetition_penalty": 1.2,
+    "no_repeat_ngram_size": 4,
+    "num_beams": 1,
+    "early_stopping": False,
+    "length_penalty": 1.0,
+}
+
+# Combine inputs and generation config for the generate function
+generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}

+# Start generation in a separate thread
+thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()

+# Iterate over the generated text
+print("Response: ", end="")
+for text in streamer:
+    print(text, end="", flush=True)