alesb2010 committed
Commit a02cabe · 1 Parent(s): 5cf7c39

Update space

Files changed (2)
  1. app.py +126 -63
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,89 +1,152 @@
 import gradio as gr
-# from transformers import pipeline # Or whatever library your model needs (e.g., torch, tensorflow)
-from transformers import AutoModel
-import os # Useful for environment variables if needed

-# 1. Load your Hugging Face model
-# Replace "your-model-id" with the actual ID of the model on Hugging Face Hub
-# Using pipeline is often the easiest way to start for common tasks
 try:
-    from llama_cpp import Llama
-    print("llama_cpp imported successfully")
-except ImportError:
-    print("Error: llama-cpp-python not installed. Please check requirements.txt and logs.")
-    Llama = None # Set to None if import fails
-llm = None
-if Llama is not None:
-    try:
-        model_repo_id = "mradermacher/DeepSeek-R1-Distill-Qwen-7B-Multilingual-i1-GGUF"
-        model_file_name = "deepseek-r1-distill-qwen-7b-multilingual-i1.Q4_K_M.gguf" # <<== VERIFY THIS FILENAME ON HF HUB
-        # Example: Sentiment Analysis model
-        # model = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
-        # model = AutoModel.from_pretrained("mradermacher/DeepSeek-R1-Distill-Qwen-7B-Multilingual-i1-GGUF")
-        # Or load specific model/tokenizer if pipeline isn't suitable:
-        # from transformers import AutoModel, AutoTokenizer
-        # tokenizer = AutoTokenizer.from_pretrained("your-model-id")
-        # model = AutoModel.from_pretrained("your-model-id")
 except Exception as e:
-    # Handle potential errors during model loading (e.g., network issues, model not found)
-    print(f"Error loading model: {e}")
-    model = None # Set model to None if loading fails


-# 2. Define the function that uses the model
-# This function takes the input from the Gradio interface
-# and returns the output that Gradio will display.
-def generate_text(prompt):
-    if llm is None:
-        return "Model failed to load. Please check App Space logs."

     try:
-        print(f"Generating completion for prompt: {prompt[:100]}...") # Log start of generation
-        # Use the model to generate text
-        # Adjust max_tokens, stop sequence, etc. based on your needs and the model
-        output = llm(
-            prompt,
-            max_tokens=512, # Max tokens to generate
-            stop=["Qwen:", "\n\n"], # Stop sequence examples (adjust as needed)
-            echo=False, # Don't include prompt in output
-            temperature=0.7, # Creativity level
-            top_p=0.9, # Nucleus sampling
         )
-        print("Generation complete.")

-        # Extract the generated text
-        generated_text = output["choices"][0]["text"]

-        return generated_text

     except Exception as e:
         print(f"Error during text generation: {e}")
         return f"An error occurred during generation: {e}"


-# 3. Define the Gradio Interface
-if llm is not None: # Only create the interface if the model loaded successfully
     interface = gr.Interface(
-        fn=generate_text, # Your new generation function
-        inputs=gr.Textbox(label="Enter your prompt", lines=5), # Text input
-        outputs=gr.Textbox(label="Generated Text", lines=10), # Text output
-        title="DeepSeek-R1-Distill-Qwen-7B GGUF Demo",
-        description="Interact with the DeepSeek-R1-Distill-Qwen-7B Multilingual model in GGUF format."
     )
 else:
-    # Interface to show error if model loading failed
     interface = gr.Interface(
-        fn=lambda x: "Application failed to load model. See logs for details.",
-        inputs=gr.Textbox(label="Status"),
         outputs=gr.Textbox(),
-        title="Application Error",
-        description="Failed to load the GGUF model. Check the logs for details on model loading errors."
     )


-# 4. Launch the Gradio App
-# This is crucial for App Spaces to run your application.
 if __name__ == "__main__":
-    # The listen='0.0.0.0' and share=False are often handled by the App Space environment
-    # but including them is harmless. App Spaces expose on port 7860 by default.
-    interface.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
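Note that the removed version above never actually instantiates the GGUF model: llm is set to None and never reassigned inside the if-block, so the app always fell back to the error interface. For reference only, a minimal sketch of what that llama-cpp path would have needed, assuming the GGUF filename is verified on the Hub (as the original comment warns) and that huggingface_hub and llama-cpp-python are installed; the n_ctx value is an assumption:

    # Hypothetical completion of the removed GGUF path (not part of this commit)
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    # repo_id and filename are copied from the removed code; verify the exact
    # GGUF filename on the Hugging Face Hub before using it
    model_path = hf_hub_download(
        repo_id="mradermacher/DeepSeek-R1-Distill-Qwen-7B-Multilingual-i1-GGUF",
        filename="deepseek-r1-distill-qwen-7b-multilingual-i1.Q4_K_M.gguf",
    )
    llm = Llama(model_path=model_path, n_ctx=4096)  # assumed context window size

The rewritten, transformers-based app.py that replaces it follows.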
 
 
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch # Needed for model operations, especially on GPU
+import os

+# --- Model Loading ---
+# Define the model ID
+model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+
+tokenizer = None
+model = None
+# Use device_map="auto" to automatically handle placing the model on GPU/CPU
+# Use torch_dtype=torch.bfloat16 or torch.float16 for reduced memory usage on compatible GPUs
 try:
+    print(f"Loading tokenizer for {model_id}...")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    print("Tokenizer loaded.")
+
+    print(f"Loading model {model_id}...")
+    # Adjust torch_dtype based on your GPU capability and memory (float16 or bfloat16 are common for speed/memory)
+    # If no GPU is available, remove device_map="auto" and the torch_dtype argument, or set device_map="cpu"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto", # Automatically select device (GPU or CPU)
+        torch_dtype=torch.bfloat16 # Use bfloat16 for better performance/memory on compatible GPUs
+        # If you have less VRAM, try torch.float16, or remove this line for float32 (uses more VRAM)
+    )
+    print("Model loaded successfully!")
+
+    # Optional: Check if the tokenizer has a chat template (DeepSeek/Qwen should)
+    if not hasattr(tokenizer, 'apply_chat_template'):
+        print(f"Warning: Tokenizer for {model_id} does not have a chat template. Model might not be optimized for chat.")
+
 except Exception as e:
+    print(f"Error loading model or tokenizer: {e}")
+    tokenizer = None # Ensure both are None if loading fails
+    model = None


+# --- Inference Function for Gradio ---
+def chat_with_model(user_input_string):
+    if model is None or tokenizer is None:
+        # Return error message if model loading failed
+        return "Model or tokenizer failed to load. Please check App Space logs."

+    # --- 1. Format the input into the chat structure ---
+    # For a single-turn chat from user input, the messages list is simple
+    messages = [
+        {"role": "user", "content": user_input_string},
+        # Add previous turns here for multi-turn chat (more complex)
+    ]
+
+    # --- 2. Apply the chat template ---
+    # The tokenizer converts the messages list into a single string formatted
+    # according to the model's specific chat requirements (e.g., adding <|im_start|>user tokens)
+    # add_generation_prompt=True tells the model it should generate the assistant's response next
     try:
+        chat_input_string = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False, # Return a string, not token IDs yet
+            add_generation_prompt=True
         )
+        print(f"Formatted chat input: {chat_input_string[:200]}...") # Log the formatted input
+
+    except Exception as e:
+        print(f"Error applying chat template: {e}")
+        return f"Error formatting input: {e}"
+
+
+    # --- 3. Tokenize the formatted input ---
+    try:
+        input_ids = tokenizer(chat_input_string, return_tensors="pt").input_ids

+        # Move input tensors to the same device as the model (e.g., GPU)
+        if model.device.type != 'cpu':
+            input_ids = input_ids.to(model.device)

+        print(f"Input token IDs shape: {input_ids.shape}")
+
+    except Exception as e:
+        print(f"Error tokenizing input: {e}")
+        return f"Error tokenizing input: {e}"
+
+
+    # --- 4. Generate response ---
+    try:
+        print("Starting text generation...")
+        # Use model.generate() for text generation
+        # max_new_tokens limits the length of the generated response
+        # Add other generation parameters (temperature, top_p, etc.) for more control
+        with torch.no_grad(): # Inference doesn't need gradient calculation, saves memory
+            outputs = model.generate(
+                input_ids,
+                max_new_tokens=512, # Limit the response length
+                temperature=0.7, # Control creativity (adjust as needed)
+                do_sample=True, # Enable sampling (recommended for chat)
+                top_p=0.95, # Top-p sampling
+                # Add other parameters like num_return_sequences if you want multiple responses
+            )
+        print("Text generation complete.")
+
+        # --- 5. Decode the output ---
+        # The generated output contains the original input tokens + the new tokens generated by the model.
+        # Decode only the new tokens that the model generated.
+        generated_tokens = outputs[0, input_ids.shape[-1]:]
+        assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+        # Clean up potential leading/trailing whitespace
+        assistant_response = assistant_response.strip()
+
+        print(f"Generated response: {assistant_response[:200]}...") # Log the generated response
+
+        return assistant_response

     except Exception as e:
         print(f"Error during text generation: {e}")
         return f"An error occurred during generation: {e}"


+# --- Gradio Interface Definition ---
+# Only create the interface if the model and tokenizer loaded successfully
+if model is not None and tokenizer is not None:
+    print("Creating Gradio interface...")
     interface = gr.Interface(
+        fn=chat_with_model,
+        inputs=gr.Textbox(label="Digite sua mensagem (Chat em Português do Brasil)", lines=5),
+        outputs=gr.Textbox(label="Resposta do Modelo", lines=10),
+        title="DeepSeek-R1-Distill-Qwen-7B Chat PT-BR Demo",
+        description="Converse com o modelo DeepSeek-R1-Distill-Qwen-7B, versão destilada.",
+        allow_flagging="never" # Disable flagging for a simple demo
     )
+    print("Gradio interface created.")
+
 else:
+    # Create a simple interface indicating an error if model loading failed
+    print("Model/Tokenizer failed to load, creating error interface.")
     interface = gr.Interface(
+        fn=lambda x: "O modelo ou tokenizer falhou ao carregar. Verifique os logs do App Space para mais detalhes.",
+        inputs=gr.Textbox(label="Status da Aplicação"),
         outputs=gr.Textbox(),
+        title="Erro na Aplicação",
+        description="Falha ao carregar o modelo Transformers. Consulte os logs para diagnóstico."
     )


+# --- Launch the Gradio App ---
+# This part is necessary for the App Space to run your Gradio app
 if __name__ == "__main__":
+    print("Launching Gradio interface...")
+    # App Spaces automatically set server_name and server_port
+    interface.launch()
+    print("Gradio launch initiated.")
requirements.txt CHANGED
@@ -1,5 +1,4 @@
 huggingface_hub==0.25.2
 gradio
 transformers
-torch
-llama-cpp-python
+torch # Or tensorflow, depending on your model's backend
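One caveat on the updated requirements: passing device_map="auto" to AutoModelForCausalLM.from_pretrained needs the accelerate package, which is not listed here, so the model load would likely still fail on the Space. A sketch of a requirements.txt that adds it (only the huggingface_hub pin comes from the original file; the other entries stay unpinned as before):

    huggingface_hub==0.25.2
    gradio
    transformers
    torch # Or tensorflow, depending on your model's backend
    accelerate # Needed for device_map="auto" in from_pretrained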