Spaces:

joaogante
/

assisted_generation_demo

Running on Zero

App Files Files Community

assisted_generation_demo / app.py

joaogante HF staff

Update app.py

7187f3b verified 22 days ago

raw

history blame contribute delete

3.76 kB

	import spaces
	import gradio as gr

	import time
	from threading import Thread

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer


	# Checkpoints: the main model and the smaller model offered to `generate` as its assistant.
	model_id = "meta-llama/Llama-3.1-8B"
	assistant_id = "meta-llama/Llama-3.2-1B"

	# The tokenizer is taken from the main model's checkpoint.
	tokenizer = AutoTokenizer.from_pretrained(model_id)

	# Main model: loaded with `load_in_4bit=True`; device placement is left to `device_map="auto"`.
	model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)

	# Assistant model: loaded with default settings, then moved to the main model's device and cast to FP16.
	assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id)
	assistant_model = assistant_model.to(device=model.device, dtype=torch.float16)


	@spaces.GPU
	def run_generation(user_text, use_assistant, temperature, max_new_tokens):
	    """Stream a completion for `user_text`, reporting throughput as it goes.

	    Generation runs on a background thread that feeds a `TextIteratorStreamer`; this generator
	    consumes the streamer and yields once per received text chunk.

	    Args:
	        user_text: Prompt to complete.
	        use_assistant: When truthy, `assistant_model` is passed to `model.generate`.
	        temperature: Sampling temperature. Values below 0.1 disable sampling (`do_sample=False`).
	        max_new_tokens: Number of new tokens to generate (coerced to `int`).

	    Yields:
	        `[model_output, tokens_per_second]`: the text generated so far and the number of tokens in
	        that text divided by the elapsed seconds, rounded to 2 decimals.
	    """
	    do_sample = temperature >= 0.1

	    # Tokenize the user text and move it to the device of the (module-level) model.
	    model_inputs = tokenizer([user_text], return_tensors="pt").to(model.device)

	    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
	    # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
	    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
	    generate_kwargs = dict(
	        model_inputs,
	        assistant_model=assistant_model if use_assistant else None,
	        streamer=streamer,
	        max_new_tokens=int(max_new_tokens),  # UI sliders may hand over a float
	        do_sample=do_sample,
	        top_p=0.95,
	        temperature=float(temperature),
	        top_k=50,
	        eos_token_id=-1,  # ensures `max_new_tokens` new tokens are always generated, can't reach EOS
	    )
	    t = Thread(target=model.generate, kwargs=generate_kwargs)
	    # Monotonic clock: elapsed-time measurements must not be affected by system clock adjustments.
	    start = time.perf_counter()
	    t.start()

	    # Pull the generated text from the streamer, and update the model output. Yield the model output and the
	    # throughput measured so far.
	    model_output = ""
	    for new_text in streamer:
	        model_output += new_text
	        time_so_far = time.perf_counter() - start
	        # Count only tokens of the generated text: without `add_special_tokens=False` the tokenizer may prepend
	        # special tokens (e.g. BOS), which would be counted as generated tokens on every update.
	        tokens_so_far = len(tokenizer(model_output, add_special_tokens=False).input_ids)
	        yield [model_output, round(tokens_so_far / time_so_far, 2)]

	    # The streamer is exhausted, so generation has finished; reap the worker thread.
	    t.join()


	def reset_textbox():
	    """Return a Gradio update whose value is the empty string."""
	    cleared = gr.update(value='')
	    return cleared


	# Page layout: a header, a wide column with prompt/output/submit, and a narrow settings column.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        "# 🤗 Assisted Generation Demo\n"
	        f"- Model: {model_id} (4-bit quantization)\n"
	        f"- Assistant Model: {assistant_id} (FP16)\n"
	        "- Recipe for good speedup: a) >10x model size difference in parameters; b) assistant trained similarly; c) CPU is not a bottleneck"
	    )

	    with gr.Row():
	        # Left column: prompt input, streamed output, and the submit button.
	        with gr.Column(scale=4):
	            user_text = gr.Textbox(value="A sequence: one, two, three, ", label="Prompt")
	            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
	            button_submit = gr.Button(value="Submit")

	        # Right column: generation controls and the throughput readout.
	        with gr.Column(scale=1, min_width=200):
	            gr.Markdown("### Generation Settings")
	            use_assistant = gr.Checkbox(label="Use Assisted Generation", value=True)
	            max_new_tokens = gr.Slider(
	                label="Max New Tokens",
	                minimum=1,
	                maximum=500,
	                value=100,
	                step=1,
	                interactive=True,
	            )
	            temperature = gr.Slider(
	                label="Temperature (0.0 = Greedy)",
	                minimum=0.0,
	                maximum=2.0,
	                value=0.6,
	                step=0.05,
	                interactive=True,
	            )
	            gr.Markdown("### Tokens per second")
	            tokens_per_second = gr.Textbox(lines=1, interactive=False, show_label=False)

	    # Pressing Enter in the prompt box and clicking the button both run the same generation call.
	    generate_inputs = [user_text, use_assistant, temperature, max_new_tokens]
	    generate_outputs = [model_output, tokens_per_second]
	    for _register in (user_text.submit, button_submit.click):
	        _register(run_generation, generate_inputs, generate_outputs)

	# Enable the request queue (at most 16 pending requests) and start the app.
	demo.queue(max_size=16).launch()