"""Gradio demo app serving the ``benhaotang/phi4-qwq-sky-t1`` chat model.

The module loads the model once at import time, wraps it in a
``text-generation`` pipeline, and exposes a single-textbox Gradio interface
whose handler returns only the assistant's reply.
"""

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
import torch
import spaces

MODEL_PATH = "benhaotang/phi4-qwq-sky-t1"
MODEL_URL = f"https://huggingface.co/{MODEL_PATH}"

# Prefix of an assistant message as it appears when a conversation (a list of
# role/content dicts) has been rendered with ``str()``.
_ASSISTANT_REPR_PREFIX = "{'role': 'assistant', 'content': '"


def load_model():
    """Build and return the text-generation pipeline for ``MODEL_PATH``.

    The model is loaded in float16 with ``device_map="auto"`` and
    ``"offload_folder"`` as the offload directory; the tokenizer is loaded
    from the same checkpoint.

    Returns:
        The ``transformers`` text-generation pipeline wrapping the model and
        tokenizer.
    """
    # NOTE(review): load_in_8bit is False and 4-bit is not requested, so this
    # config may not actually quantize anything -- confirm the intent.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=False,
        llm_int8_enable_fp32_cpu_offload=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        torch_dtype=torch.float16,
        offload_folder="offload_folder",
        quantization_config=bnb_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    return pipe


pipe = load_model()


def _extract_assistant_reply(generated) -> str:
    """Return only the assistant's reply from a pipeline ``generated_text``.

    Args:
        generated: The ``generated_text`` value of one pipeline result. For
            chat-style input this is the conversation as a list of
            ``{"role": ..., "content": ...}`` dicts; a plain string is also
            accepted.

    Returns:
        The content of the last assistant message, or the best available
        textual fallback when no assistant message can be identified.
    """
    if isinstance(generated, (list, tuple)):
        # Chat format: walk backwards to find the newest assistant turn.
        for message in reversed(generated):
            if isinstance(message, dict) and message.get("role") == "assistant":
                return str(message.get("content", ""))
        return str(generated)

    if isinstance(generated, str):
        # String format: keep the original behaviour of slicing the last
        # assistant message out of a stringified conversation.
        start = generated.rfind(_ASSISTANT_REPR_PREFIX)
        if start != -1:
            end = generated.rfind("'}")
            if end != -1:
                return generated[start + len(_ASSISTANT_REPR_PREFIX):end]
        return generated

    return str(generated)


@spaces.GPU(duration=110)
def generate_response(prompt: str, max_length: int = 1024) -> str:
    """Generate the assistant's answer to ``prompt``.

    Args:
        prompt: The user's question, sent as the ``user`` message after a
            fixed system prompt.
        max_length: Passed to the pipeline as ``max_new_tokens``.

    Returns:
        The assistant's reply text only (not the whole conversation).
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful AI assistant. You always think step by step.",
        },
        {"role": "user", "content": prompt},
    ]
    outputs = pipe(messages, max_new_tokens=max_length)
    return _extract_assistant_reply(outputs[0]["generated_text"])


# Example question shown in the UI. A raw string keeps the LaTeX backslashes
# literal without relying on invalid escape sequences, and each numbered item
# sits on its own line.
example_prompt = r"""For a scalar field theory with interaction Lagrangian $\mathcal{L}_{int} = g\phi^3 + \lambda\phi^4$:
1. Enumerate all possible 1-loop Feynman diagrams contributing to the scalar propagator
2. For each diagram, write down its loop contribution
3. Provide Mathematica code to calculate these loop amplitudes with dimensional regularization at $d=4-\epsilon$
Please explain your reasoning step by step."""

demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your question",
            placeholder="Ask me anything...",
            lines=5,
        ),
    ],
    outputs=gr.Textbox(label="Response", lines=10),
    title="benhaotang/phi4-qwq-sky-t1",
    description=(
        " To achieve CoT and science reasoning on small scale with a merge "
        "of CoT finetuned phi4 model. "
        f"Model: [benhaotang/phi4-qwq-sky-t1]({MODEL_URL})"
    ),
    examples=[
        [example_prompt],
    ],
)

demo.launch()