Spaces:

AleSb
/

my-deep-world

Sleeping

App Files Community

alesb2010 committed on May 8

Commit

a02cabe

1 Parent(s): 5cf7c39

Update space

Browse files

Files changed (2) hide show

app.py +126 -63
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -1,89 +1,152 @@
 import gradio as gr
-# from transformers import pipeline # Or whatever library your model needs (e.g., torch, tensorflow)
-from transformers import AutoModel
-import os # Useful for environment variables if needed
-# 1. Load your Hugging Face model
-# Replace "your-model-id" with the actual ID of the model on Hugging Face Hub
-# Using pipeline is often the easiest way to start for common tasks
 try:
-    from llama_cpp import Llama
-    print("llama_cpp imported successfully")
-except ImportError:
-    print("Error: llama-cpp-python not installed. Please check requirements.txt and logs.")
-    Llama = None # Set to None if import fails
-llm = None
-if Llama is not None:
-    try:
-        model_repo_id = "mradermacher/DeepSeek-R1-Distill-Qwen-7B-Multilingual-i1-GGUF"
-        model_file_name = "deepseek-r1-distill-qwen-7b-multilingual-i1.Q4_K_M.gguf" # <<== VERIFY THIS FILENAME ON HF HUB
-    # Example: Sentiment Analysis model
-    # model = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
-    # model = AutoModel.from_pretrained("mradermacher/DeepSeek-R1-Distill-Qwen-7B-Multilingual-i1-GGUF")
-    # Or load specific model/tokenizer if pipeline isn't suitable:
-    # from transformers import AutoModel, AutoTokenizer
-    # tokenizer = AutoTokenizer.from_pretrained("your-model-id")
-    # model = AutoModel.from_pretrained("your-model-id")
 except Exception as e:
-     # Handle potential errors during model loading (e.g., network issues, model not found)
-     print(f"Error loading model: {e}")
-     model = None # Set model to None if loading fails
-# 2. Define the function that uses the model
-# This function takes the input from the Gradio interface
-# and returns the output that Gradio will display.
-def generate_text(prompt):
-    if llm is None:
-        return "Model failed to load. Please check App Space logs."
     try:
-        print(f"Generating completion for prompt: {prompt[:100]}...") # Log start of generation
-        # Use the model to generate text
-        # Adjust max_tokens, stop sequence, etc. based on your needs and the model
-        output = llm(
-            prompt,
-            max_tokens=512, # Max tokens to generate
-            stop=["Qwen:", "\n\n"], # Stop sequence examples (adjust as needed)
-            echo=False, # Don't include prompt in output
-            temperature=0.7, # Creativity level
-            top_p=0.9, # Nucleus sampling
         )
-        print("Generation complete.")
-        # Extract the generated text
-        generated_text = output["choices"][0]["text"]
-        return generated_text
     except Exception as e:
         print(f"Error during text generation: {e}")
         return f"An error occurred during generation: {e}"
-# 3. Define the Gradio Interface
-if llm is not None: # Only create the interface if the model loaded successfully
     interface = gr.Interface(
-        fn=generate_text,        # Your new generation function
-        inputs=gr.Textbox(label="Enter your prompt", lines=5), # Text input
-        outputs=gr.Textbox(label="Generated Text", lines=10),   # Text output
-        title="DeepSeek-R1-Distill-Qwen-7B GGUF Demo",
-        description="Interact with the DeepSeek-R1-Distill-Qwen-7B Multilingual model in GGUF format."
     )
 else:
-    # Interface to show error if model loading failed
      interface = gr.Interface(
-        fn=lambda x: "Application failed to load model. See logs for details.",
-        inputs=gr.Textbox(label="Status"),
         outputs=gr.Textbox(),
-        title="Application Error",
-        description="Failed to load the GGUF model. Check the logs for details on model loading errors."
     )
-# 4. Launch the Gradio App
-# This is crucial for App Spaces to run your application.
 if __name__ == "__main__":
-    # The listen='0.0.0.0' and share=False are often handled by the App Space environment
-    # but including them is harmless. App Spaces expose on port 7860 by default.
-    interface.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch # Needed for model operations, especially on GPU
+import os
+# --- Model Loading ---
+# Define the model ID
+model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+tokenizer = None
+model = None
+# Use device_map="auto" to automatically handle placing the model on GPU/CPU
+# Use torch_dtype=torch.bfloat16 or torch.float16 for reduced memory usage on compatible GPUs
 try:
+    print(f"Loading tokenizer for {model_id}...")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    print("Tokenizer loaded.")
+    print(f"Loading model {model_id}...")
+    # Adjust torch_dtype based on your GPU capability and memory (float16 or bfloat16 are common for speed/memory)
+    # If no GPU is available, remove device_map="auto" and the torch_dtype argument, or set device_map="cpu"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto", # Automatically select device (GPU or CPU)
+        torch_dtype=torch.bfloat16 # Use bfloat16 for better performance/memory on compatible GPUs
+        # If you have less VRAM, try torch.float16, or remove this line for float32 (uses more VRAM)
+    )
+    print("Model loaded successfully!")
+    # Optional: Check if the tokenizer has a chat template (DeepSeek/Qwen should)
+    if not hasattr(tokenizer, 'apply_chat_template'):
+        print(f"Warning: Tokenizer for {model_id} does not have a chat template. Model might not be optimized for chat.")
 except Exception as e:
+    print(f"Error loading model or tokenizer: {e}")
+    tokenizer = None # Ensure both are None if loading fails
+    model = None
+# --- Inference Function for Gradio ---
+def chat_with_model(user_input_string):
+    if model is None or tokenizer is None:
+        # Return error message if model loading failed
+        return "Model or tokenizer failed to load. Please check App Space logs."
+    # --- 1. Format the input into the chat structure ---
+    # For a single-turn chat from user input, the messages list is simple
+    messages = [
+        {"role": "user", "content": user_input_string},
+        # Add previous turns here for multi-turn chat (more complex)
+    ]
+    # --- 2. Apply the chat template ---
+    # The tokenizer converts the messages list into a single string formatted
+    # according to the model's specific chat requirements (e.g., adding <|im_start|>user tokens)
+    # add_generation_prompt=True tells the model it should generate the assistant's response next
     try:
+        chat_input_string = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False, # Return a string, not token IDs yet
+            add_generation_prompt=True
         )
+        print(f"Formatted chat input: {chat_input_string[:200]}...") # Log the formatted input
+    except Exception as e:
+        print(f"Error applying chat template: {e}")
+        return f"Error formatting input: {e}"
+    # --- 3. Tokenize the formatted input ---
+    try:
+        input_ids = tokenizer(chat_input_string, return_tensors="pt").input_ids
+        # Move input tensors to the same device as the model (e.g., GPU)
+        if model.device.type != 'cpu':
+             input_ids = input_ids.to(model.device)
+        print(f"Input token IDs shape: {input_ids.shape}")
+    except Exception as e:
+        print(f"Error tokenizing input: {e}")
+        return f"Error tokenizing input: {e}"
+    # --- 4. Generate response ---
+    try:
+        print("Starting text generation...")
+        # Use model.generate() for text generation
+        # max_new_tokens limits the length of the generated response
+        # Add other generation parameters (temperature, top_p, etc.) for more control
+        with torch.no_grad(): # Inference doesn't need gradient calculation, saves memory
+             outputs = model.generate(
+                 input_ids,
+                 max_new_tokens=512, # Limit the response length
+                 temperature=0.7,    # Control creativity (adjust as needed)
+                 do_sample=True,     # Enable sampling (recommended for chat)
+                 top_p=0.95,         # Top-p sampling
+                 # Add other parameters like num_return_sequences if you want multiple responses
+             )
+        print("Text generation complete.")
+        # --- 5. Decode the output ---
+        # The generated output contains the original input tokens + the new tokens generated by the model.
+        # Decode only the new tokens that the model generated.
+        generated_tokens = outputs[0, input_ids.shape[-1]:]
+        assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        # Clean up potential leading/trailing whitespace
+        assistant_response = assistant_response.strip()
+        print(f"Generated response: {assistant_response[:200]}...") # Log the generated response
+        return assistant_response
     except Exception as e:
         print(f"Error during text generation: {e}")
         return f"An error occurred during generation: {e}"
+# --- Gradio Interface Definition ---
+# Only create the interface if the model and tokenizer loaded successfully
+if model is not None and tokenizer is not None:
+    print("Creating Gradio interface...")
     interface = gr.Interface(
+        fn=chat_with_model,
+        inputs=gr.Textbox(label="Digite sua mensagem (Chat em Português do Brasil)", lines=5),
+        outputs=gr.Textbox(label="Resposta do Modelo", lines=10),
+        title="DeepSeek-R1-Distill-Qwen-7B Chat PT-BR Demo",
+        description="Converse com o modelo DeepSeek-R1-Distill-Qwen-7B, versão destilada.",
+        allow_flagging="never" # Disable flagging for a simple demo
     )
+    print("Gradio interface created.")
 else:
+     # Create a simple interface indicating an error if model loading failed
+     print("Model/Tokenizer failed to load, creating error interface.")
      interface = gr.Interface(
+        fn=lambda x: "O modelo ou tokenizer falhou ao carregar. Verifique os logs do App Space para mais detalhes.",
+        inputs=gr.Textbox(label="Status da Aplicação"),
         outputs=gr.Textbox(),
+        title="Erro na Aplicação",
+        description="Falha ao carregar o modelo Transformers. Consulte os logs para diagnóstico."
     )
+# --- Launch the Gradio App ---
+# This part is necessary for the App Space to run your Gradio app
 if __name__ == "__main__":
+    print("Launching Gradio interface...")
+    # App Spaces automatically set server_name and server_port
+    interface.launch()
+    print("Gradio launch initiated.")

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
 huggingface_hub==0.25.2
 gradio
 transformers
-torch
-llama-cpp-python

 huggingface_hub==0.25.2
 gradio
 transformers
+torch # Or tensorflow, depending on your model's backend