Commit 0f6159c
Parent(s): 92c66d1

Refactor inference script to streamline usage; remove unnecessary classes and integrate threading for response generation

Files changed:
- README.md +1 -130
- inference.py +65 -207
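
In outline, the refactored `inference.py` streams responses by running `model.generate` on a background thread and reading decoded text from a `TextIteratorStreamer`. Below is a minimal sketch of that pattern, using the base model and adapter IDs that appear in the diff; the prompt string, `max_new_tokens`, and the final `thread.join()` are illustrative choices, not the script's exact values.

```python
import threading

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Load the 4-bit quantized base model, then attach the LoRA adapter.
base = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
    ),
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "zahemen9900/finsight-ai")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Illustrative prompt; the actual script builds the input with the chat template.
inputs = tokenizer("What is dollar-cost averaging?", return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a worker thread while
# the main thread prints text chunks as the streamer yields them.
thread = threading.Thread(
    target=model.generate,
    kwargs={
        "input_ids": inputs["input_ids"],
        "max_new_tokens": 64,
        "streamer": streamer,
    },
)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()  # illustrative: wait for the generation thread to finish
```

The full script in the diff additionally applies the tokenizer's chat template with a system prompt and passes a richer generation config (temperature, repetition penalty, n-gram blocking, and so on).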
README.md
CHANGED
@@ -100,137 +100,8 @@ Our evaluation demonstrates significant performance improvements across all stan

 ## Usage

-
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
-import torch
-from peft import PeftModel
-import threading
-
-# For 4-bit quantized inference (recommended)
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# First load the base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Then load the adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-prompt = "What's your name, and what're you good at?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-# Tokenize the formatted prompt
-inputs = tokenizer(formatted_prompt, return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
-
-# Create a streamer
-streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-
-# Adjust generation parameters for more controlled responses
-generation_config = {
-    "max_new_tokens": 256,
-    "temperature": 0.6,
-    "top_p": 0.95,
-    "do_sample": True,
-    "pad_token_id": tokenizer.eos_token_id,
-    "eos_token_id": tokenizer.eos_token_id,
-    "repetition_penalty": 1.2,
-    "no_repeat_ngram_size": 4,
-    "num_beams": 1,
-    "early_stopping": False,
-    "length_penalty": 1.0,
-}
-
-# Combine inputs and generation config for the generate function
-generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}
-
-# Start generation in a separate thread
-thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-thread.start()
-
-# Iterate over the generated text
-print("Response: ", end="")
-for text in streamer:
-    print(text, end="", flush=True)
-```
-
-### Simple Non-Streaming Usage
-
-If you prefer a simpler approach without streaming:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import torch
-from peft import PeftModel
-
-# For 4-bit quantized inference
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# Load base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    quantization_config=bnb_config,
-    device_map="auto"
-)
-
-# Load adapter weights (LoRA)
-model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
-
-# Prepare input
-system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
-user_prompt = "What's a good strategy for long-term investing?"
-
-messages = [
-    {"role": "system", "content": system_prompt},
-    {"role": "user", "content": user_prompt}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-
-inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-
-# Generate response
-outputs = model.generate(
-    inputs.input_ids,
-    max_new_tokens=256,
-    temperature=0.7,
-    top_p=0.95,
-    do_sample=True,
-    repetition_penalty=1.2
-)
-
-response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print("Response:\n", response.strip())
-```
+Check out the usage functionality in `inference.py`.

 ## Training Details

inference.py
CHANGED
@@ -1,212 +1,70 @@
-
-"""
-FinSight AI - Inference script for financial advisory chatbot
-This script provides a simple way to interact with the model via command line
-"""
-
-import os
-import argparse
 import torch
-from
 )

-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config,
-            device_map="auto" if self.device == "cuda" else None,
-            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
-        )
-
-        if self.device == "cpu":
-            self.model = self.model.to(self.device)
-
-        self.model.eval()
-        self.conversation_history = []
-        self.system_message = {
-            "role": "system",
-            "content": (
-                "You are FinSight AI, a helpful and knowledgeable financial assistant. "
-                "You can provide information and guidance on financial topics, market trends, investment strategies, "
-                "and personal finance management. Always strive to be accurate, informative, and helpful. "
-                "Remember that you cannot provide personalized financial advice that would require knowing a person's "
-                "complete financial situation or future market movements."
-            )
-        }
-
-    def generate_response(
-        self,
-        prompt: str,
-        temperature: float = 0.7,
-        max_new_tokens: int = 512,
-        stream: bool = True
-    ) -> str:
-        """Generate response from the model"""
-        # Manage conversation history (keep last 5 exchanges)
-        if len(self.conversation_history) > 10:
-            self.conversation_history = self.conversation_history[-10:]
-
-        # Create messages with history
-        messages = [self.system_message] + self.conversation_history
-        messages.append({"role": "user", "content": prompt})
-
-        # Format prompt using chat template
-        formatted_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        # Encode the input
-        inputs = self.tokenizer(
-            formatted_prompt,
-            return_tensors="pt",
-            truncation=True,
-            max_length=4096
-        ).to(self.device)
-
-        # Setup streamer if requested
-        streamer = TextStreamer(
-            self.tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        ) if stream else None
-
-        # Generate response
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                inputs.input_ids,
-                attention_mask=inputs.attention_mask,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=temperature,
-                top_p=0.95,
-                streamer=streamer,
-                pad_token_id=self.tokenizer.eos_token_id,
-                repetition_penalty=1.1
-            )
-
-        # Return the response
-        if not stream:
-            response = self.tokenizer.decode(
-                output_ids[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
-            print("\nAssistant:", response)
-        else:
-            response = ""  # Response was already streamed
-
-        # Update conversation history
-        self.conversation_history.append({"role": "user", "content": prompt})
-        self.conversation_history.append({"role": "assistant", "content": response if response else "[Response was streamed]"})
-
-        return response
-
-    def start_chat_loop(self):
-        """Start an interactive chat session"""
-        print("\nWelcome to FinSight AI - Your Financial Advisory Assistant!")
-        print("Type 'quit', 'exit', or press Ctrl+C to end the conversation.\n")
-
-        while True:
-            try:
-                user_input = input("\nYou: ").strip()
-                if user_input.lower() in ["quit", "exit", "q"]:
-                    break
-
-                if user_input.lower() == "clear":
-                    self.conversation_history = []
-                    print("Conversation history cleared.")
-                    continue
-
-                print("\nAssistant: ", end="", flush=True)
-                self.generate_response(user_input)
-
-            except KeyboardInterrupt:
-                print("\nExiting chat...")
-                break
-            except Exception as e:
-                print(f"\nError: {e}")
-                continue
-
-        print("\nThank you for using FinSight AI. Goodbye!")

-
-        "--model_id",
-        type=str,
-        default="zahemen9900/finsight-ai",
-        help="Model ID or path to load"
-    )
-    parser.add_argument(
-        "--no_quantize",
-        action="store_true",
-        help="Disable 4-bit quantization (uses more memory)"
-    )
-    parser.add_argument(
-        "--query",
-        type=str,
-        help="Single query mode: provide a question and get one response"
-    )
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=0.7,
-        help="Sampling temperature (higher = more random)"
-    )
-    parser.add_argument(
-        "--max_tokens",
-        type=int,
-        default=512,
-        help="Maximum number of new tokens to generate"
-    )
-
-    args = parser.parse_args()
-
-    advisor = FinancialAdvisor(
-        model_id=args.model_id,
-        use_4bit=not args.no_quantize
-    )
-
-    # Single query mode
-    if args.query:
-        advisor.generate_response(
-            args.query,
-            temperature=args.temperature,
-            max_new_tokens=args.max_tokens
-        )
-    # Interactive chat mode
-    else:
-        advisor.start_chat_loop()

+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import torch
+from peft import PeftModel
+import threading
+
+# For 4-bit quantized inference (recommended)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+# First load the base model with quantization
+base_model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+# Then load the adapter weights (LoRA)
+model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
+prompt = "What's your name, and what're you good at?"
+
+messages = [
+    {"role": "system", "content": system_prompt},
+    {"role": "user", "content": prompt}
+]
+
+formatted_prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
 )

+# Tokenize the formatted prompt
+inputs = tokenizer(formatted_prompt, return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}  # Move all tensors to device
+
+# Create a streamer
+streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+
+# Adjust generation parameters for more controlled responses
+generation_config = {
+    "max_new_tokens": 256,
+    "temperature": 0.6,
+    "top_p": 0.95,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": tokenizer.eos_token_id,
+    "repetition_penalty": 1.2,
+    "no_repeat_ngram_size": 4,
+    "num_beams": 1,
+    "early_stopping": False,
+    "length_penalty": 1.0,
+}
+
+# Combine inputs and generation config for the generate function
+generation_kwargs = {**generation_config, "input_ids": inputs["input_ids"], "streamer": streamer}

+# Start generation in a separate thread
+thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()

+# Iterate over the generated text
+print("Response: ", end="")
+for text in streamer:
+    print(text, end="", flush=True)