import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "sarvamai/sarvam-m"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

def generate_response(prompt):
    # Wrap the user prompt in the chat template with thinking mode enabled
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        enable_thinking=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate output with temperature=0.2 (do_sample=True is needed for
    # the temperature setting to take effect)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=8192,
        do_sample=True,
        temperature=0.2
    )
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
    output_text = tokenizer.decode(output_ids)

    # Split the output into the reasoning trace and the final answer
    if "</think>" in output_text:
        reasoning_content = output_text.split("</think>")[0].rstrip("\n")
        content = output_text.split("</think>")[-1].lstrip("\n").rstrip("</s>")
    else:
        reasoning_content = ""
        content = output_text.rstrip("</s>")

    return reasoning_content, content

# Gradio UI
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="Enter your prompt"),
    outputs=[
        gr.Textbox(label="Reasoning"),
        gr.Textbox(label="Response")
    ],
    title="Sarvam-M Chat Interface",
    description="Enter a prompt and receive both the internal reasoning and the final answer from the Sarvam-M model."
)

iface.launch()