import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
)
from threading import Thread
import gradio as gr

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
torch.set_default_device(device)

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    # torch_dtype=torch.float16 if has_gpu else torch.float32,
    torch_dtype=torch.float32,
    device_map=device,
    trust_remote_code=True,
)


# custom stopping criteria (avoid generating hallucinated prompts):
# still includes these tokens in the output but stops generating after them
class Phi2StoppingCriteria(StoppingCriteria):
    def __init__(self):
        stop_list = ["Exercise", "Exercises", "exercises:", "<|endoftext|>"]
        # pre-tokenize each stop phrase once so __call__ only compares token ids
        self.tokenphrases = [
            tokenizer(token, return_tensors="pt").input_ids[0].tolist()
            for token in stop_list
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # stop as soon as the generated sequence ends with any stop phrase
        for tokenphrase in self.tokenphrases:
            if tokenphrase == input_ids[0].tolist()[-len(tokenphrase):]:
                return True
        return False


def generate(
    prompt,
    max_new_tokens=75,
    terminate_hallucinated_prompts=True,
    sampling=False,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # run generation on a background thread and stream tokens as they arrive,
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        stopping_criteria=[Phi2StoppingCriteria()]
        if terminate_hallucinated_prompts
        else None,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # yield the accumulated text so Gradio can update the output box live
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Slider(minimum=0, maximum=500, step=1, value=50, label="max new tokens"),
        gr.Checkbox(
            value=True,
            label="terminate hallucinated prompts",
            info="stop generation after tokens like 'Exercise' or '<|endoftext|>', but do not remove them.",
        ),
        gr.Checkbox(
            label="do sampling",
            info="introduce randomness for non-deterministic results; required for the options below",
            value=True,
        ),
        gr.Slider(
            label="temperature",
            info="higher temperature means more randomness",
            value=1.0,
            minimum=0.1,
            maximum=1.5,
            step=0.1,
        ),
        gr.Slider(
            label="top-k",
            info="consider only the k most likely tokens",
            value=50,
            minimum=1,
            maximum=100,
            step=1,
        ),
        gr.Slider(
            label="top-p",
            info="choose from the smallest set of tokens whose cumulative probability exceeds p",
            value=1.0,
            minimum=0.1,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            100,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''```
def print_prime(n):
    """
    Print all primes between 1 and n
    """\n''',
            125,
        ],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing model with only 2.7B parameters.",
)

if __name__ == "__main__":
    demo.queue().launch(show_api=False)