import traceback

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "stephenlzc/dolphin-llama3-zh-cn-uncensored"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
print("Tokenizer loaded successfully")

# Check if a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the model: half precision on GPU, full precision on CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
).to(device)
print("Model loaded successfully")


def generate_response(system_message, user_message):
    try:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        # Build the prompt with the model's chat template.
        # add_generation_prompt=True appends the assistant header so the
        # model begins a fresh reply instead of continuing the user turn.
        input_ids = tokenizer.apply_chat_template(
            conversation=messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)
        attention_mask = torch.ones_like(input_ids)

        output = model.generate(
            inputs=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=512,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens, skipping the echoed prompt
        generated_response = tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True
        )
        return generated_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_message


iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="System Message"),
        gr.Textbox(label="User Message"),
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="llama3 cn uncensored Chatbot",
)

if __name__ == "__main__":
    iface.launch()