import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

import time
import gradio as gr

# Hub id of the base checkpoint, and the local directory that holds the
# fine-tuned PEFT adapter. The tokenizer is loaded from that same directory.
BASE_MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
ADAPTER_DIR = './finetunedPEFTModel'

# NOTE: 4-bit NF4 quantization (a BitsAndBytesConfig with bfloat16 compute
# passed as `quantization_config`) is currently disabled; the base weights are
# loaded directly in bfloat16 instead.
_base_model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **_base_model_kwargs)

# Attach the fine-tuned adapter weights on top of the base model.
model.load_adapter(ADAPTER_DIR)

# The tokenizer comes from the adapter directory rather than the base
# checkpoint, and no pad-token override is applied.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, trust_remote_code=True)


def generateText(inputText="What is QLora finetuning?", num_tokens=200):
    """Generate a completion for ``inputText`` with the fine-tuned model.

    Args:
        inputText: Prompt text. It is handed to the text-generation pipeline
            verbatim; no instruction or chat template is wrapped around it.
        num_tokens: Maximum number of *newly generated* tokens. Coerced to
            ``int`` because a UI slider may deliver a float.

    Returns:
        The ``generated_text`` field of the first pipeline result, or an
        empty string when the prompt is missing or whitespace-only.
    """
    # A blank prompt carries no question; bail out before starting a
    # generation run, which is slow on this deployment.
    if not inputText or not inputText.strip():
        return ""

    # `max_new_tokens` limits only the generated continuation. The previous
    # `max_length` also counted the prompt tokens, so a long question could
    # leave little or no room for an answer, contradicting the UI label
    # ("Number of tokens that you want in your output").
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=int(num_tokens),
    )
    result = pipe(inputText)

    return result[0]['generated_text']
    

# Static text rendered at the top of the demo page.
title = "Fine tuned Phi3.5 instruct model on OpenAssist dataset using QLora"
description = (
    "Fine tuned Phi3.5 instruct model on OpenAssist dataset using QLora. "
    "Running on CPU and thus a bit slow. "
    "So please be patient on submitting a request as it might take 15 to 20 "
    "minutes for a response."
)

# Sample questions offered in the UI; each one is paired with a 200-token
# budget to form a [question, num_tokens] example row.
_example_questions = (
    "How can I optimize my web page for online search so that it is on top?",
    "Can you give me an example of python script for Fibonacci series?",
    "Can you explain what is Contrastive Loss in Deep Learning?",
    "How are Sentence Transformers different from Huggingface Transformers?",
)
examples = [[question, 200] for question in _example_questions]

# Input widgets, in the same order as generateText's parameters.
_question_box = gr.Textbox(label="Question that you want to ask")
_token_slider = gr.Slider(
    minimum=100,
    maximum=500,
    value=200,
    step=100,
    label="Number of tokens that you want in your output",
)

demo = gr.Interface(
    fn=generateText,
    inputs=[_question_box, _token_slider],
    outputs=[gr.Text()],
    title=title,
    description=description,
    examples=examples,
    cache_examples=False,  # examples are not pre-computed
)
demo.launch()