import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch  # Needed for model operations, especially on GPU
import os

# --- Model Loading ---
# Define the model ID
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

tokenizer = None
model = None

# Use device_map="auto" to automatically handle placing the model on GPU/CPU.
# Use torch_dtype=torch.bfloat16 or torch.float16 for reduced memory usage on compatible GPUs.
try:
    print(f"Loading tokenizer for {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Tokenizer loaded.")

    print(f"Loading model {model_id}...")
    # Adjust torch_dtype based on your GPU capability and memory (float16 or bfloat16 are common for speed/memory).
    # If no GPU is available, remove device_map="auto" and the torch_dtype argument, or set device_map="cpu".
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",          # Automatically select device (GPU or CPU)
        torch_dtype=torch.bfloat16  # Use bfloat16 for better performance/memory on compatible GPUs.
                                    # If you have less VRAM, try torch.float16, or remove this line for float32 (uses more VRAM).
    )
    print("Model loaded successfully!")

    # Optional: check whether the tokenizer defines a chat template (DeepSeek/Qwen should)
    if getattr(tokenizer, "chat_template", None) is None:
        print(f"Warning: Tokenizer for {model_id} does not define a chat template. The model might not be optimized for chat.")

except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    tokenizer = None  # Ensure both are None if loading fails
    model = None


# --- Inference Function for Gradio ---
def chat_with_model(user_input_string):
    if model is None or tokenizer is None:
        # Return an error message if model loading failed
        return "Model or tokenizer failed to load. Please check App Space logs."

    # --- 1. Format the input into the chat structure ---
    # For a single-turn chat from user input, the messages list is simple.
    messages = [
        {"role": "user", "content": user_input_string},
        # Add previous turns here for multi-turn chat (more complex); see the sketch at the end of this file.
    ]

    # --- 2. Apply the chat template ---
    # The tokenizer converts the messages list into a single string formatted
    # according to the model's specific chat requirements (e.g., adding <|im_start|>user tokens).
    # add_generation_prompt=True tells the model it should generate the assistant's response next.
    try:
        chat_input_string = tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # Return a string, not token IDs yet
            add_generation_prompt=True
        )
        print(f"Formatted chat input: {chat_input_string[:200]}...")  # Log the formatted input
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error formatting input: {e}"

    # --- 3. Tokenize the formatted input ---
    try:
        inputs = tokenizer(chat_input_string, return_tensors="pt")
        # Move input tensors to the same device as the model (e.g., GPU)
        if model.device.type != 'cpu':
            inputs = inputs.to(model.device)
        input_ids = inputs.input_ids
        print(f"Input token IDs shape: {input_ids.shape}")
    except Exception as e:
        print(f"Error tokenizing input: {e}")
        return f"Error tokenizing input: {e}"

    # --- 4. Generate response ---
    try:
        print("Starting text generation...")
        # Use model.generate() for text generation.
        # max_new_tokens limits the length of the generated response.
        # Add other generation parameters (temperature, top_p, etc.)
        # for more control.
        with torch.no_grad():  # Inference doesn't need gradient calculation, saves memory
            outputs = model.generate(
                input_ids,
                attention_mask=inputs.attention_mask,  # Pass the mask explicitly to avoid generation warnings
                max_new_tokens=512,  # Limit the response length
                temperature=0.7,     # Control creativity (adjust as needed)
                do_sample=True,      # Enable sampling (recommended for chat)
                top_p=0.95,          # Top-p sampling
                # Add other parameters like num_return_sequences if you want multiple responses
            )
        print("Text generation complete.")

        # --- 5. Decode the output ---
        # The generated output contains the original input tokens plus the new tokens generated by the model.
        # Decode only the new tokens that the model generated.
        generated_tokens = outputs[0, input_ids.shape[-1]:]
        assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Clean up potential leading/trailing whitespace
        assistant_response = assistant_response.strip()

        print(f"Generated response: {assistant_response[:200]}...")  # Log the generated response
        return assistant_response

    except Exception as e:
        print(f"Error during text generation: {e}")
        return f"An error occurred during generation: {e}"


# --- Gradio Interface Definition ---
# Only create the interface if the model and tokenizer loaded successfully
if model is not None and tokenizer is not None:
    print("Creating Gradio interface...")
    interface = gr.Interface(
        fn=chat_with_model,
        inputs=gr.Textbox(label="Digite sua mensagem (Chat em Português do Brasil)", lines=5),
        outputs=gr.Textbox(label="Resposta do Modelo", lines=10),
        title="DeepSeek-R1-Distill-Qwen-7B Chat PT-BR Demo",
        description="Converse com o modelo DeepSeek-R1-Distill-Qwen-7B, versão destilada.",
        allow_flagging="never"  # Disable flagging for a simple demo
    )
    print("Gradio interface created.")
else:
    # Create a simple interface indicating an error if model loading failed
    print("Model/Tokenizer failed to load, creating error interface.")
    interface = gr.Interface(
        fn=lambda x: "O modelo ou tokenizer falhou ao carregar. Verifique os logs do App Space para mais detalhes.",
        inputs=gr.Textbox(label="Status da Aplicação"),
        outputs=gr.Textbox(),
        title="Erro na Aplicação",
        description="Falha ao carregar o modelo Transformers. Consulte os logs para diagnóstico."
    )


# --- Launch the Gradio App ---
# This part is necessary for the App Space to run your Gradio app
if __name__ == "__main__":
    print("Launching Gradio interface...")
    # App Spaces automatically set server_name and server_port
    interface.launch()
    print("Gradio launch initiated.")
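
# --- Optional: Multi-Turn Chat Sketch ---
# A minimal sketch (not wired into the interface above) of how previous turns could be
# folded into the messages list, as referenced in chat_with_model. It assumes history
# arrives as (user, assistant) string pairs, the format gr.ChatInterface passes to its
# callback in Gradio 4.x; chat_with_history and the wiring note below are illustrative,
# not part of the original app. If adopted, this function would live above the interface
# definition and replace chat_with_model there.
def chat_with_history(user_input_string, history):
    if model is None or tokenizer is None:
        return "Model or tokenizer failed to load. Please check App Space logs."

    # Rebuild the full conversation so the chat template sees every prior turn
    messages = []
    for past_user, past_assistant in history:
        messages.append({"role": "user", "content": past_user})
        messages.append({"role": "assistant", "content": past_assistant})
    messages.append({"role": "user", "content": user_input_string})

    chat_input_string = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(chat_input_string, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
        )
    generated_tokens = outputs[0, inputs.input_ids.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Example wiring (would replace the gr.Interface block above):
# interface = gr.ChatInterface(fn=chat_with_history, title="DeepSeek-R1-Distill-Qwen-7B Chat PT-BR Demo")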