---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---

```python
#!/usr/bin/env python3
import time

from vllm import LLM, SamplingParams


def main():
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Define sampling parameters with an increased max_tokens and a stop string.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,  # Increase this to allow longer responses.
        stop=["\nUser:"],  # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's input to the conversation history.
        conversation += f"User: {user_input}\nBot: "

        print("Bot: ", end="", flush=True)

        # Generate a response using the conversation history and sampling parameters.
        response = model.generate(conversation, sampling_params=sampling_params)

        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after bot reply.

        # Append the bot reply to conversation history.
        conversation += bot_reply + "\n"


if __name__ == "__main__":
    main()
```
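
Because the base model is instruction-tuned, you may get better results by letting vLLM apply the model's chat template instead of concatenating raw `User:`/`Bot:` strings. Below is a minimal sketch of the same loop using `LLM.chat`; it assumes a recent vLLM release that provides this method, and is meant as a starting point rather than a drop-in replacement for the script above.

```python
#!/usr/bin/env python3
from vllm import LLM, SamplingParams

# Assumes a vLLM version that exposes LLM.chat(), which formats a list of
# role/content messages with the model's own chat template before generating.
llm = LLM(model="miike-ai/qwen-14b-coder-fp8", enforce_eager=True, dtype="auto")
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=32000)

messages = []
while True:
    user_input = input("User: ").strip()
    if user_input.lower() in {"exit", "quit"}:
        break
    messages.append({"role": "user", "content": user_input})
    # Generate a reply for the full conversation so far.
    outputs = llm.chat(messages, sampling_params=sampling_params)
    reply = outputs[0].outputs[0].text.strip()
    print(f"Bot: {reply}")
    messages.append({"role": "assistant", "content": reply})
```

With the chat template in use, the manual `stop=["\nUser:"]` workaround should generally be unnecessary, since generation ends at the template's end-of-turn token.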