import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
)
from threading import Thread
import gradio as gr

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
torch.set_default_device(device)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    # torch_dtype=torch.float16 if has_gpu else torch.float32,
    torch_dtype=torch.float32,
    device_map=device,
    trust_remote_code=True,
)
# custom stopping criteria (avoid generating hallucinated prompts)
# still includes these tokens in the output but stops generating after them
class Phi2StoppingCriteria(StoppingCriteria):
    def __init__(self):
        stop_list = ["Exercise", "Exercises", "exercises:", "<|endoftext|>"]
        # pre-tokenize each stop phrase once so __call__ can compare token ids
        tokenphrases = []
        for token in stop_list:
            tokenphrases.append(
                tokenizer(token, return_tensors="pt").input_ids[0].tolist()
            )
        self.tokenphrases = tokenphrases

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # stop when the generated ids end with any stop phrase
        for tokenphrase in self.tokenphrases:
            if tokenphrase == input_ids[0].tolist()[-len(tokenphrase):]:
                return True
        return False
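
# Note: the criteria above compares token-id suffixes, so a stop phrase only
# triggers when the model emits the exact token split that tokenizer() gives
# the standalone string. A looser, text-based variant (a sketch, not used
# here) would decode and compare strings instead:
#
#     text = tokenizer.decode(input_ids[0])
#     return any(text.endswith(phrase) for phrase in stop_list)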
def generate(
    prompt,
    max_new_tokens=75,
    terminate_hallucinated_prompts=True,
    sampling=False,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        stopping_criteria=[Phi2StoppingCriteria()]
        if terminate_hallucinated_prompts
        else None,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # run generation on a background thread so this thread can consume the
    # streamer and yield partial output as it arrives
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output
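
# Quick smoke test (illustrative only; the prompt and token budget below are
# arbitrary, and this is not part of the Gradio app that follows):
#
#     for partial in generate("Who was Ada Lovelace?", max_new_tokens=25):
#         print(partial)  # each `partial` is the cumulative output so far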
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Slider(minimum=0, maximum=500, step=1, value=50, label="max new tokens"),
        gr.Checkbox(
            value=True,
            label="terminate hallucinated prompts",
            info="stop generation after tokens like 'Exercise' or '<|endoftext|>', but do not remove them from the output",
        ),
        gr.Checkbox(
            label="do sampling",
            info="introduce randomness for non-deterministic results; required for the options below",
            value=True,
        ),
        gr.Slider(
            label="temperature",
            info="higher temperature means more randomness",
            value=1.0,
            minimum=0.1,
            maximum=1.5,
            step=0.1,
        ),
        gr.Slider(
            label="top-k",
            info="consider only the k most likely tokens",
            value=50,
            minimum=1,
            maximum=100,
            step=1,
        ),
        gr.Slider(
            label="top-p",
            info="choose from the smallest set of tokens whose cumulative probability exceeds p",
            value=1.0,
            minimum=0.1,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            100,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''```
def print_prime(n):
    """
    Print all primes between 1 and n
    """\n''',
            125,
        ],
        ["User: How does sleep affect mood?\nAI:", 100],
        ["Who was Ada Lovelace?", 25],
        ["Explain the concept of skip lists.", 400],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing model with only 2.7B parameters.",
)
if __name__ == "__main__":
    # queue() lets the generator fn stream partial results to the UI
    demo.queue().launch(show_api=False)