"""Gradio demo: Phi-3.5-mini-instruct with a locally stored fine-tuned PEFT adapter."""
import time

import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# 4-bit NF4 quantization settings, currently disabled. To re-enable, uncomment
# this config together with the `quantization_config` argument below.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Base model in bfloat16, with the fine-tuned adapter weights attached on top.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    torch_dtype=torch.bfloat16,
    # quantization_config=bnb_config,
    trust_remote_code=True,
)
model.load_adapter('./finetunedPEFTModel')

# The tokenizer is read from the adapter directory rather than the base model.
tokenizer = AutoTokenizer.from_pretrained('./finetunedPEFTModel', trust_remote_code=True)
# Disabled alternatives kept for reference:
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)

# Build the generation pipeline once at import time; constructing it inside the
# request handler repeated the same setup work on every call.
_text_generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)


def generateText(inputText: str = "What is QLora finetuning?", num_tokens: int = 200) -> str:
    """Generate a completion for *inputText* with the fine-tuned model.

    Args:
        inputText: Prompt passed to the model unchanged (no chat template or
            instruction wrapper is applied).
        num_tokens: Maximum number of tokens to generate. The value is cast to
            ``int`` because UI sliders may deliver it as a float.

    Returns:
        The ``generated_text`` field of the first pipeline result. With the
        pipeline's default settings this is the prompt followed by the
        generated continuation.
    """
    # `max_new_tokens` budgets only the generated part. The previous
    # `max_length` also counted the prompt tokens, so a long question could
    # leave little or no room for the answer.
    result = _text_generator(inputText, max_new_tokens=int(num_tokens))
    return result[0]['generated_text']


title = "Fine tuned Phi3.5 instruct model on OpenAssist dataset using QLora"
description = "Fine tuned Phi3.5 instruct model on OpenAssist dataset using QLora. Running on CPU and thus a bit slow. So please be patient on submitting a request as it might take 15 to 20 minutes for a response."
# Example inputs handed to the interface. Each row is [question, token count],
# matching the order of the two input components declared below.
_example_questions = (
    "How can I optimize my web page for online search so that it is on top?",
    "Can you give me an example of python script for Fibonacci series?",
    "Can you explain what is Contrastive Loss in Deep Learning?",
    "How are Sentence Transformers different from Huggingface Transformers?",
)
examples = [[question, 200] for question in _example_questions]

# Web form: a free-text question plus a token-count slider in, plain text out.
demo = gr.Interface(
    fn=generateText,
    inputs=[
        gr.Textbox(label="Question that you want to ask"),
        gr.Slider(
            minimum=100,
            maximum=500,
            value=200,
            step=100,
            label="Number of tokens that you want in your output",
        ),
    ],
    outputs=[gr.Text()],
    title=title,
    description=description,
    examples=examples,
    cache_examples=False,
)

# Start the Gradio server as soon as the script is executed.
demo.launch()