# Use this script to chat with "mistral-7b-medical-o1-ft"; it answers your questions until you type '\q' or 'quit' to end the conversation.
# !pip install unsloth  # install unsloth if it is not already installed
from unsloth import FastLanguageModel
import torch

# Define the Alpaca prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}
"""

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Subh775/mistral-7b-medical-o1-ft",
    max_seq_length=2048,
    load_in_4bit=True
)

# Enable optimized inference mode for faster generation
FastLanguageModel.for_inference(model)

# Chat loop that keeps the conversation history as context
def chat():
    print("Chat with the model! Type '\\q' or 'quit' to stop.\n")
    chat_history = ""  # Stores the conversation history

    while True:
        # Get user input
        user_input = input("You: ")

        # Exit condition
        if user_input.lower() in ['\\q', 'quit']:
            print("\nExiting the chat. Goodbye!")
            break

        # Format the current input with the Alpaca template and append it to the history
        prompt = alpaca_prompt.format(
            instruction="Please answer the following medical question.",
            input_text=user_input,
            output=""
        )
        chat_history += prompt + "\n"

        # Tokenize the combined history and move it to the GPU
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        # Generate output with the configured sampling parameters
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2
        )

        # Decode and keep only the text after the last "### Response:" marker
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        clean_output = decoded_output[0].split('### Response:')[-1].strip()

        # Append the response to the chat history
        chat_history += f"{clean_output}\n"

        # Display the response
        print(f"\nModel: {clean_output}\n")

# Start the chat
chat()
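
# Optional minimal sketch, not part of the original script: chat_history grows on
# every turn while the model was loaded with max_seq_length=2048, so very long
# conversations can overflow the context window. Assuming the `tokenizer` loaded
# above and a hypothetical reply budget of 256 tokens, this helper keeps only the
# most recent tokens. If used, define it above chat() and call
# chat_history = truncate_history(chat_history, tokenizer) right before tokenizing.
def truncate_history(chat_history, tokenizer, max_seq_length=2048, reply_budget=256):
    # Tokenize the full history into a 1-D tensor of token ids
    token_ids = tokenizer(chat_history, return_tensors="pt").input_ids[0]
    budget = max_seq_length - reply_budget
    if token_ids.shape[0] > budget:
        # Keep only the newest tokens so the most recent turns survive
        token_ids = token_ids[-budget:]
    # Decode back to text so the existing prompt-building code keeps working
    return tokenizer.decode(token_ids, skip_special_tokens=True)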