from unsloth import FastLanguageModel
import torch
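
# Alpaca-style prompt template used to format each question for the model.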
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: {instruction}

### Input: {input_text}

### Response: {output}
"""
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Subh775/mistral-7b-medical-o1-ft",
    max_seq_length=2048,
    load_in_4bit=True
)
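
# Enable Unsloth's optimized inference mode.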
FastLanguageModel.for_inference(model)
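

# Simple REPL-style loop: each turn is formatted with the Alpaca template,
# appended to the running history, and fed back to the model.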
def chat():
    print("Chat with the model! Type '\\q' or 'quit' to stop.\n")

    chat_history = ""

    while True:
        user_input = input("You: ")

        if user_input.lower() in ['\\q', 'quit']:
            print("\nExiting the chat. Goodbye!")
            break
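
        # Fill the template with the user's question; the response slot is left
        # empty so the model completes it.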
        prompt = alpaca_prompt.format(
            instruction="Please answer the following medical question.",
            input_text=user_input,
            output=""
        )
        chat_history += prompt + "\n"
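
        # Tokenize the full history so earlier turns stay in context; assumes a
        # CUDA-capable GPU is available.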
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")
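
        # Sample a response; no_repeat_ngram_size=2 curbs verbatim repetition.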
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2
        )
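
        # The decoded text contains the whole history, so keep only what follows
        # the final "### Response:" marker.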
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        clean_output = decoded_output[0].split('### Response:')[-1].strip()
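
        # Record the answer in the history so follow-up questions have context.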
        chat_history += f"{clean_output}\n"
        print(f"\nModel: {clean_output}\n")


chat()