#!/usr/bin/env python3
"""
Simple inference script for the merged Phi-4-mini 128K model.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


def generate_response(prompt, max_new_tokens=500):
    model_path = "/data/phi4_merged_128k"

    # Load model and tokenizer
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )

    # Decode and return
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


if __name__ == "__main__":
    # Example usage
    prompt = "What is the capital of France?"
    response = generate_response(prompt)
    print(f"Response: {response}")
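

# A possible refinement (not part of the original script, sketched here as an
# assumption): Phi-4-mini is an instruction-tuned model, so formatting the prompt
# with the tokenizer's chat template usually yields better answers than raw text,
# and slicing off the prompt tokens returns only the model's reply. The helper name
# generate_chat_response is hypothetical; it reuses a model and tokenizer loaded
# the same way as in generate_response above.
def generate_chat_response(model, tokenizer, prompt, max_new_tokens=500):
    # Wrap the prompt in the chat format the model was fine-tuned on
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )

    # Decode only the newly generated tokens, skipping the echoed prompt
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)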