# politeness-demo / politeness_gradio.py
import spaces
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import pipeline
import pandas as pd
import gradio as gr
# Llama 3.2 3B Instruct setup (quantization removed in favour of bfloat16 on supported devices)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.bfloat16 if torch_device in ["cuda", "mps"] else torch.float32
model_id = "meta-llama/Llama-3.2-3B-Instruct"

llama_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    device_map=torch_device,
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    torch_dtype=torch_dtype,  # match the dtype chosen above so CPU falls back to float32
    # the model is already placed on torch_device, so no device_map is passed here
)
@spaces.GPU
def llama_QA(input_question):
    """
    Ask Llama a question and return its answer.

    inputs:
    - input_question [str]: question for Llama to answer

    outputs:
    - response [str]: Llama's response
    """
    messages = [
        {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions helpfully."},
        {"role": "user", "content": input_question},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=512,
    )
    # the pipeline returns the full chat history; the last message is the assistant's reply
    response = outputs[0]["generated_text"][-1]["content"]
    return response
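
# Example usage (illustrative only; the question below is a made-up placeholder):
# llama_QA("What is the capital of France?")  # -> a short string answer from the model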
# Rude/polite question roundtrip
@spaces.GPU
def llama_rudepolite_roundtrip(input_question, polite=True):
    """
    Wrap the question in a polite or rude framing before asking Llama.

    inputs:
    - input_question [str]: question to ask
    - polite [bool]: if True, wrap the question politely; if False, wrap it rudely

    outputs:
    - response [str]: Llama's response to the wrapped question
    """
    if polite:
        input_question = f"Hi there, thank you so much for offering to help me! This is my question: {input_question} - thanks so much for your answer!"
    else:
        input_question = f"You're an idiot - if you don't help me properly you're stupid. This is my question: {input_question}. If you get this wrong, you're even stupider than I thought."
    response = llama_QA(input_question)
    return response
@spaces.GPU
def gradio_func(input_question):
    """
    Wrapper function for Gradio that runs both the left-hand (rude) and
    right-hand (polite) sides of the app from a single callback.
    """
    left_output = llama_rudepolite_roundtrip(input_question, polite=False)
    right_output = llama_rudepolite_roundtrip(input_question, polite=True)
    return left_output, right_output
# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("Ask Llama the same question but on the left it's rude and on the right it's polite!")
        with gr.Row():
            question_input = gr.Textbox(label="Enter your question", interactive=True)
        with gr.Row():
            submit_btn = gr.Button("generate responses")
        with gr.Row():
            left_output = gr.Textbox(label="rude answer", interactive=False)
            right_output = gr.Textbox(label="polite answer", interactive=False)
        submit_btn.click(
            fn=gradio_func,
            inputs=[question_input],
            outputs=[left_output, right_output],
        )
    return demo
# Launch the app
demo = create_interface()
demo.launch()
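
# If a public link is needed outside the Space, Gradio also supports
# demo.launch(share=True); the plain launch() above is what the Space itself uses.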