# politeness-demo / politeness_gradio.py
import spaces
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import pipeline
import pandas as pd
import gradio as gr
# Llama 3.2 3B Instruct setup (quantization removed in favour of bfloat16 on supported devices)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.bfloat16 if torch_device in ["cuda", "mps"] else torch.float32
model_id = "meta-llama/Llama-3.2-3B-Instruct"

llama_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    device_map=torch_device,
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    torch_dtype=torch_dtype,  # match the dtype chosen above so CPU falls back to float32
    # the model is already placed on torch_device, so no device_map is passed here
)
@spaces.GPU
def llama_QA(input_question):
    """
    Ask Llama a question and return its answer.

    inputs:
    - input_question [str]: question for Llama to answer

    outputs:
    - response [str]: Llama's response
    """
    messages = [
        {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions helpfully."},
        {"role": "user", "content": input_question},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=512,
    )
    # the pipeline returns the full chat history; the last message is the assistant's reply
    response = outputs[0]["generated_text"][-1]["content"]
    return response
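
# Example usage (illustrative only; the question below is a made-up placeholder):
# llama_QA("What is the capital of France?")  # -> a short string answer from the model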
# Rude/polite question roundtrip
@spaces.GPU
def llama_rudepolite_roundtrip(input_question, polite=True):
    """
    Wrap the question in a polite or rude framing before asking Llama.

    inputs:
    - input_question [str]: question to ask
    - polite [bool]: if True, wrap the question politely; if False, wrap it rudely

    outputs:
    - response [str]: Llama's response to the wrapped question
    """
    if polite:
        input_question = f"Hi there, thank you so much for offering to help me! This is my question: {input_question} - thanks so much for your answer!"
    else:
        input_question = f"You're an idiot - if you don't help me properly you're stupid. This is my question: {input_question}. If you get this wrong, you're even stupider than I thought."
    response = llama_QA(input_question)
    return response
@spaces.GPU
def gradio_func(input_question):
    """
    Wrapper function for Gradio that runs both the left-hand (rude) and
    right-hand (polite) sides of the app from a single callback.
    """
    left_output = llama_rudepolite_roundtrip(input_question, polite=False)
    right_output = llama_rudepolite_roundtrip(input_question, polite=True)
    return left_output, right_output
# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("Ask Llama the same question but on the left it's rude and on the right it's polite!")
        with gr.Row():
            question_input = gr.Textbox(label="Enter your question", interactive=True)
        with gr.Row():
            submit_btn = gr.Button("generate responses")
        with gr.Row():
            left_output = gr.Textbox(label="rude answer", interactive=False)
            right_output = gr.Textbox(label="polite answer", interactive=False)
        submit_btn.click(
            fn=gradio_func,
            inputs=[question_input],
            outputs=[left_output, right_output],
        )
    return demo
# Launch the app
demo = create_interface()
demo.launch()
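
# If a public link is needed outside the Space, Gradio also supports
# demo.launch(share=True); the plain launch() above is what the Space itself uses.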