# Use this script to chat with "mistral-7b-medical-o1-ft"; it answers your questions until you type '\q' or 'quit' to end the conversation.
# !pip install unsloth  # install unsloth if it is not already installed
from unsloth import FastLanguageModel
import torch

# Define the Alpaca prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}
"""

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Subh775/mistral-7b-medical-o1-ft",
    max_seq_length=2048,
    load_in_4bit=True
)

# Enable optimized inference mode for faster generation
FastLanguageModel.for_inference(model)

# Chat loop that keeps the conversation history as context
def chat():
    print("Chat with the model! Type '\\q' or 'quit' to stop.\n")
    chat_history = ""  # Stores the conversation history

    while True:
        # Get user input
        user_input = input("You: ")

        # Exit condition
        if user_input.lower() in ['\\q', 'quit']:
            print("\nExiting the chat. Goodbye!")
            break

        # Format the current input with the Alpaca template and append it to the history
        prompt = alpaca_prompt.format(
            instruction="Please answer the following medical question.",
            input_text=user_input,
            output=""
        )
        chat_history += prompt + "\n"

        # Tokenize the combined history and move it to the GPU
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        # Generate output with the configured sampling parameters
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2
        )

        # Decode and keep only the text after the last "### Response:" marker
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        clean_output = decoded_output[0].split('### Response:')[-1].strip()

        # Append the response to the chat history
        chat_history += f"{clean_output}\n"

        # Display the response
        print(f"\nModel: {clean_output}\n")

# Start the chat
chat()
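
# Optional minimal sketch, not part of the original script: chat_history grows on
# every turn while the model was loaded with max_seq_length=2048, so very long
# conversations can overflow the context window. Assuming the `tokenizer` loaded
# above and a hypothetical reply budget of 256 tokens, this helper keeps only the
# most recent tokens. If used, define it above chat() and call
# chat_history = truncate_history(chat_history, tokenizer) right before tokenizing.
def truncate_history(chat_history, tokenizer, max_seq_length=2048, reply_budget=256):
    # Tokenize the full history into a 1-D tensor of token ids
    token_ids = tokenizer(chat_history, return_tensors="pt").input_ids[0]
    budget = max_seq_length - reply_budget
    if token_ids.shape[0] > budget:
        # Keep only the newest tokens so the most recent turns survive
        token_ids = token_ids[-budget:]
    # Decode back to text so the existing prompt-building code keeps working
    return tokenizer.decode(token_ids, skip_special_tokens=True)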