import traceback

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "stephenlzc/dolphin-llama3-zh-cn-uncensored"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
print("Tokenizer loaded successfully")

# Check if a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the model: half precision on GPU, full precision on CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
).to(device)
print("Model loaded successfully")


def generate_response(system_message, user_message):
    try:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        # Build the prompt with the model's chat template.
        # add_generation_prompt=True appends the assistant header so the
        # model begins a fresh reply instead of continuing the user turn.
        input_ids = tokenizer.apply_chat_template(
            conversation=messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)
        attention_mask = torch.ones_like(input_ids)

        output = model.generate(
            inputs=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=512,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens, skipping the echoed prompt
        generated_response = tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True
        )
        return generated_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_message


iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="System Message"),
        gr.Textbox(label="User Message"),
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="llama3 cn uncensored Chatbot",
)

if __name__ == "__main__":
    iface.launch()