from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import torch
from peft import PeftModel
import threading

# For 4-bit quantized inference (recommended)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# First load the base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto"
)

# Then load the adapter weights (LoRA)
model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

device = "cuda" if torch.cuda.is_available() else "cpu"

system_prompt = "You are Finsight, a finance bot trained to assist users with financial insights"
prompt = "What's your name, and what're you good at?"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt}
]

# Format the conversation with the model's chat template
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize the formatted prompt and move all tensors to the target device
inputs = tokenizer(formatted_prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Create a streamer so tokens can be printed as they are generated
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

# Adjust generation parameters for more controlled responses
generation_config = {
    "max_new_tokens": 256,
    "temperature": 0.6,
    "top_p": 0.95,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "repetition_penalty": 1.2,
    "no_repeat_ngram_size": 4,
    "num_beams": 1,
    "early_stopping": False,
    "length_penalty": 1.0,
}

# Combine the tokenized inputs (input_ids and attention_mask) with the generation config
generation_kwargs = {**generation_config, **inputs, "streamer": streamer}

# Start generation in a separate thread so the streamer can be consumed in the main thread
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Iterate over the generated text as it streams in
print("Response: ", end="")
for text in streamer:
    print(text, end="", flush=True)

thread.join()