import spaces
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Llama 3.2 3B setup
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model_id = "meta-llama/Llama-3.2-3B-Instruct"

torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.bfloat16 if torch_device in ["cuda", "mps"] else torch.float32

llama_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config,
    torch_dtype=torch_dtype,
    device_map=torch_device,
    # load_in_4bit=True  # for low-memory devices
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_id)

# The model is already loaded and placed on torch_device, so the pipeline only wires it up.
pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    torch_dtype=torch_dtype,
)


@spaces.GPU
def llama_QA(input_question):
    """
    Ask Llama a question and return its answer.

    inputs:
    - input_question [str]: question for Llama to answer
    outputs:
    - response [str]: Llama's response
    """
    messages = [
        {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions helpfully."},
        {"role": "user", "content": input_question},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=512,
    )
    # The chat pipeline returns the full message list; the last entry is the assistant's reply.
    response = outputs[0]["generated_text"][-1]["content"]
    return response


# Wrap the question in a rude or polite framing before asking Llama
@spaces.GPU
def llama_rudepolite_roundtrip(input_question, polite=True):
    """
    Rephrase the question rudely or politely, then ask Llama.

    inputs:
    - input_question [str]: question to ask
    - polite [bool]: if True, wrap the question in a polite framing; otherwise a rude one
    outputs:
    - response [str]: Llama's answer to the rewrapped question
    """
    if polite:
        input_question = f"Hi there, thank you so much for offering to help me! This is my question: {input_question} - thanks so much for your answer!"
    else:
        input_question = f"You're an idiot - if you don't help me properly you're stupid. This is my question: {input_question}. If you get this wrong, you're even stupider than I thought."

    response = llama_QA(input_question)
    return response


@spaces.GPU
def gradio_func(input_question):
    """
    Wrapper that runs both the rude (left) and polite (right) sides of the app
    from a single function so Gradio can wire one click handler to both outputs.
    """
    left_output = llama_rudepolite_roundtrip(input_question, polite=False)
    right_output = llama_rudepolite_roundtrip(input_question, polite=True)
    return left_output, right_output


# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("Ask Llama the same question but on the left it's rude and on the right it's polite!")
        with gr.Row():
            question_input = gr.Textbox(label="Enter your question", interactive=True)
        with gr.Row():
            submit_btn = gr.Button("generate responses")
        with gr.Row():
            left_output = gr.Textbox(label="rude answer", interactive=False)
            right_output = gr.Textbox(label="polite answer", interactive=False)

        submit_btn.click(
            fn=gradio_func,
            inputs=[question_input],
            outputs=[left_output, right_output],
        )
    return demo


# Launch the app
demo = create_interface()
demo.launch()