kimhyunwoo committed
Commit 2673193 · verified · 1 Parent(s): 61bf4d9

Update app.py

Files changed (1)
  1. app.py +40 -77
app.py CHANGED
@@ -5,47 +5,45 @@ import gc
 import os
 import datetime
 
-# --- Configuration ---
 MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
-MAX_NEW_TOKENS = 512 # Limit output length for faster response on CPU (adjust as needed)
-CPU_THREAD_COUNT = 4 # Limit threads torch uses on CPU if needed (adjust based on Space CPU core count)
+MAX_NEW_TOKENS = 512
+CPU_THREAD_COUNT = 4  # adjust as needed
 
-# Set PyTorch CPU thread count (optional, might help prevent resource exhaustion)
 # torch.set_num_threads(CPU_THREAD_COUNT)
 # os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
 # os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
 
 print("--- Environment Setup ---")
 print(f"PyTorch version: {torch.__version__}")
-print(f"Running on device: cpu") # Explicitly state we expect CPU
-print(f"Torch Threads: {torch.get_num_threads()}") # Check default threads
+print(f"Running on device: cpu")
+print(f"Torch Threads: {torch.get_num_threads()}")
 
-# --- Model Loading ---
 print(f"--- Loading Model: {MODEL_ID} ---")
 print("This might take a few minutes, especially on the first launch...")
 
+model = None
+tokenizer = None
+
 try:
-    # Load model explicitly onto CPU with float32 (standard for CPU compatibility)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32, # Use float32 for CPU compatibility
-        device_map="cpu" # Explicitly map to CPU
-        # low_cpu_mem_usage=True # Can sometimes help on low RAM, but might slow down loading
+        torch_dtype=torch.float32,
+        device_map="cpu",
+        force_download=True  # force a fresh re-download
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_ID,
+        force_download=True  # force a fresh re-download
     )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model.eval() # Set model to evaluation mode
+    model.eval()
     print("--- Model and Tokenizer Loaded Successfully on CPU ---")
 
-    # --- Stop Token Configuration ---
-    # Get IDs for specified stop tokens and the standard EOS token
     stop_token_strings = ["<|endofturn|>", "<|stop|>"]
     stop_token_ids_list = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
 
-    # Ensure the official EOS token is also included if not already present
     if tokenizer.eos_token_id not in stop_token_ids_list:
         stop_token_ids_list.append(tokenizer.eos_token_id)
 
-    # Remove None values if any token wasn't found (though they should be in this vocab)
     stop_token_ids_list = [tid for tid in stop_token_ids_list if tid is not None]
 
     if not stop_token_ids_list:
@@ -56,120 +54,94 @@ try:
 
 except Exception as e:
     print(f"!!! Error loading model: {e}")
-    # Clean up memory if partial loading occurred
-    del model
-    del tokenizer
+    if 'model' in locals() and model is not None:
+        del model
+    if 'tokenizer' in locals() and tokenizer is not None:
+        del tokenizer
     gc.collect()
-    # Raise a Gradio error to make it visible in the UI
     raise gr.Error(f"Failed to load the model {MODEL_ID}. Please check the Space logs. Error: {e}")
 
 
-# --- System Prompt ---
-# Use a dynamic date and the correct model name as per the card example structure
 def get_system_prompt():
-    # current_date = datetime.datetime.now().strftime("%Y년 %m월 %d일(%a)") # Korean date format
-    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)") # English date format is safer for consistency
+    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
     return (
         f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
-        # f"- 오늘은 {current_date}이다.\n" # Dynamic date can be added if desired
+        # f"- 오늘은 {current_date}이다.\n" # uncomment if needed
         f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
     )
 
-
-# --- Inference Function ---
 def predict(message, history):
-    """
-    Generates a response using the HyperCLOVAX model based on user message and chat history.
-    Handles chat formatting, generation, decoding, and memory management.
-    """
     system_prompt = get_system_prompt()
 
-    # 1. Format conversation history according to the model's expected template
     chat_history_formatted = [
-        {"role": "tool_list", "content": ""}, # Required by the model card example
+        {"role": "tool_list", "content": ""},
        {"role": "system", "content": system_prompt}
     ]
     for user_msg, ai_msg in history:
         chat_history_formatted.append({"role": "user", "content": user_msg})
-        # Ensure assistant response is included correctly, potentially adding endofturn if needed by template logic,
-        # but apply_chat_template usually handles this.
-        chat_history_formatted.append({"role": "assistant", "content": ai_msg}) # Append the actual AI response
+        chat_history_formatted.append({"role": "assistant", "content": ai_msg})
 
-    # Add the latest user message
     chat_history_formatted.append({"role": "user", "content": message})
 
-    # 2. Apply the chat template
+    inputs = None
+    output_ids = None
+
     try:
         inputs = tokenizer.apply_chat_template(
             chat_history_formatted,
-            add_generation_prompt=True, # Crucial for instruction-following models
+            add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt"
-        ).to(model.device) # Ensure inputs are on the correct device (CPU)
+        ).to(model.device)
         input_length = inputs['input_ids'].shape[1]
         print(f"\nInput tokens: {input_length}")
-        # print(f"Formatted input text (approx): {tokenizer.decode(inputs['input_ids'][0])}") # For debugging
 
     except Exception as e:
         print(f"!!! Error applying chat template: {e}")
-        # Provide feedback to the user
         return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"
 
-    # 3. Generate response using the model
-    output_ids = None # Initialize output_ids
     try:
         print("Generating response...")
-        # Use torch.no_grad() to reduce memory footprint during inference
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
-                eos_token_id=stop_token_ids_list, # Use the list of stop token IDs
-                pad_token_id=tokenizer.eos_token_id, # Set pad token ID to EOS token ID
-                do_sample=True, # Enable sampling for less repetitive output
-                temperature=0.7, # Control randomness (lower = more focused)
-                top_p=0.9, # Use nucleus sampling
-                # num_beams=1, # Use 1 for sampling (greedy is default if do_sample=False)
-                # early_stopping=True # Stop generation early if EOS is reached
+                eos_token_id=stop_token_ids_list,
+                pad_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
             )
         print("Generation complete.")
 
     except Exception as e:
         print(f"!!! Error during model generation: {e}")
-        # Clean up potentially large tensors in case of error
-        del inputs
+        if inputs is not None: del inputs
         if output_ids is not None: del output_ids
         gc.collect()
         return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"
 
-    # 4. Decode the response
-    # We need to decode only the newly generated tokens, excluding the input tokens
     new_tokens = output_ids[0, input_length:]
     response = tokenizer.decode(new_tokens, skip_special_tokens=True)
 
     print(f"Output tokens: {len(new_tokens)}")
-    # print(f"Raw response: '{response}'") # Log the raw decoded output
 
-    # 5. Clean up memory
     del inputs
     del output_ids
     del new_tokens
-    gc.collect() # Explicitly run garbage collection
+    gc.collect()
     print("Memory cleaned.")
 
     return response
 
-# --- Gradio Interface ---
 print("--- Setting up Gradio Interface ---")
 
-# Use ChatInterface for a user-friendly chat experience
 chatbot_component = gr.Chatbot(
     label="HyperCLOVA X SEED (0.5B) 대화",
     bubble_full_width=False,
     height=600
 )
 
-# Define examples relevant to the model's strengths (Korean)
 examples = [
     ["네이버 클로바X는 무엇인가요?"],
     ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
@@ -177,10 +149,9 @@ examples = [
     ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
 ]
 
-# Create the Gradio ChatInterface
 demo = gr.ChatInterface(
-    fn=predict, # The function to call for generating responses
-    chatbot=chatbot_component, # The chatbot display component
+    fn=predict,
+    chatbot=chatbot_component,
     title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
     description=(
         f"**모델:** {MODEL_ID}\n"
@@ -189,21 +160,13 @@ demo = gr.ChatInterface(
         f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
     ),
     examples=examples,
-    cache_examples=False, # Disable caching on free tier to save disk/memory
-    theme="soft", # Use a soft theme
+    cache_examples=False,
+    theme="soft",
     retry_btn="다시 시도",
     undo_btn="이전 턴 삭제",
     clear_btn="대화 초기화",
 )
 
-# --- Launch the App ---
 if __name__ == "__main__":
     print("--- Launching Gradio App ---")
-    # queue() is important for handling multiple users, especially with slow inference on CPU
-    # Use concurrency_count=1 if resource exhaustion occurs, otherwise default might be okay.
-    demo.queue(
-        # default_concurrency_limit=1 # Limit concurrent requests if needed
-    ).launch(
-        # share=False # Set to True to get a public link (requires login)
-        # server_name="0.0.0.0" # To make it accessible on the network if running locally
-    )
+    demo.queue().launch()
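
A minimal standalone sketch (not part of this commit) for checking that the stop tokens kept by this change resolve to usable token IDs, assuming only transformers is installed; the model ID and token strings come from the diff above, everything else is illustrative.

# Illustrative only, not part of app.py: confirm the stop tokens resolve to
# IDs before relying on them in model.generate(eos_token_id=...).
from transformers import AutoTokenizer

MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

stop_token_strings = ["<|endofturn|>", "<|stop|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_token_strings]

# Same guard as the app: keep the official EOS id and drop anything unresolved
# (convert_tokens_to_ids may return None or the UNK id for unknown tokens,
# depending on the tokenizer).
if tokenizer.eos_token_id not in stop_token_ids:
    stop_token_ids.append(tokenizer.eos_token_id)
stop_token_ids = [tid for tid in stop_token_ids if tid is not None]

print("stop token ids:", stop_token_ids)
print("eos id:", tokenizer.eos_token_id, "unk id:", tokenizer.unk_token_id)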