kimhyunwoo committed
Commit 2673193 · verified · 1 Parent(s): 61bf4d9

Update app.py

Files changed (1)
  1. app.py +40 -77
app.py CHANGED
@@ -5,47 +5,45 @@ import gc
 import os
 import datetime
 
-# --- Configuration ---
 MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
-MAX_NEW_TOKENS = 512 # Limit output length for faster response on CPU (adjust as needed)
-CPU_THREAD_COUNT = 4 # Limit threads torch uses on CPU if needed (adjust based on Space CPU core count)
+MAX_NEW_TOKENS = 512
+CPU_THREAD_COUNT = 4  # adjust as needed
 
-# Set PyTorch CPU thread count (optional, might help prevent resource exhaustion)
 # torch.set_num_threads(CPU_THREAD_COUNT)
 # os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
 # os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
 
 print("--- Environment Setup ---")
 print(f"PyTorch version: {torch.__version__}")
-print(f"Running on device: cpu") # Explicitly state we expect CPU
-print(f"Torch Threads: {torch.get_num_threads()}") # Check default threads
+print(f"Running on device: cpu")
+print(f"Torch Threads: {torch.get_num_threads()}")
 
-# --- Model Loading ---
 print(f"--- Loading Model: {MODEL_ID} ---")
 print("This might take a few minutes, especially on the first launch...")
 
+model = None
+tokenizer = None
+
 try:
-    # Load model explicitly onto CPU with float32 (standard for CPU compatibility)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32, # Use float32 for CPU compatibility
-        device_map="cpu" # Explicitly map to CPU
-        # low_cpu_mem_usage=True # Can sometimes help on low RAM, but might slow down loading
+        torch_dtype=torch.float32,
+        device_map="cpu",
+        force_download=True  # force a fresh re-download
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_ID,
+        force_download=True  # force a fresh re-download
     )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model.eval() # Set model to evaluation mode
+    model.eval()
     print("--- Model and Tokenizer Loaded Successfully on CPU ---")
 
-    # --- Stop Token Configuration ---
-    # Get IDs for specified stop tokens and the standard EOS token
     stop_token_strings = ["<|endofturn|>", "<|stop|>"]
     stop_token_ids_list = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
 
-    # Ensure the official EOS token is also included if not already present
     if tokenizer.eos_token_id not in stop_token_ids_list:
         stop_token_ids_list.append(tokenizer.eos_token_id)
 
-    # Remove None values if any token wasn't found (though they should be in this vocab)
     stop_token_ids_list = [tid for tid in stop_token_ids_list if tid is not None]
 
     if not stop_token_ids_list:
@@ -56,120 +54,94 @@ try:
 
 except Exception as e:
     print(f"!!! Error loading model: {e}")
-    # Clean up memory if partial loading occurred
-    del model
-    del tokenizer
+    if 'model' in locals() and model is not None:
+        del model
+    if 'tokenizer' in locals() and tokenizer is not None:
+        del tokenizer
     gc.collect()
-    # Raise a Gradio error to make it visible in the UI
     raise gr.Error(f"Failed to load the model {MODEL_ID}. Please check the Space logs. Error: {e}")
 
 
-# --- System Prompt ---
-# Use a dynamic date and the correct model name as per the card example structure
 def get_system_prompt():
-    # current_date = datetime.datetime.now().strftime("%Y년 %m월 %d일(%a)") # Korean date format
-    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)") # English date format is safer for consistency
+    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
     return (
         f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
-        # f"- 오늘은 {current_date}이다.\n" # Dynamic date can be added if desired
+        # f"- 오늘은 {current_date}이다.\n" # uncomment if needed
         f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
     )
 
-
-# --- Inference Function ---
 def predict(message, history):
-    """
-    Generates a response using the HyperCLOVAX model based on user message and chat history.
-    Handles chat formatting, generation, decoding, and memory management.
-    """
     system_prompt = get_system_prompt()
 
-    # 1. Format conversation history according to the model's expected template
     chat_history_formatted = [
-        {"role": "tool_list", "content": ""}, # Required by the model card example
+        {"role": "tool_list", "content": ""},
        {"role": "system", "content": system_prompt}
     ]
     for user_msg, ai_msg in history:
         chat_history_formatted.append({"role": "user", "content": user_msg})
-        # Ensure assistant response is included correctly, potentially adding endofturn if needed by template logic,
-        # but apply_chat_template usually handles this.
-        chat_history_formatted.append({"role": "assistant", "content": ai_msg}) # Append the actual AI response
+        chat_history_formatted.append({"role": "assistant", "content": ai_msg})
 
-    # Add the latest user message
     chat_history_formatted.append({"role": "user", "content": message})
 
-    # 2. Apply the chat template
+    inputs = None
+    output_ids = None
+
     try:
         inputs = tokenizer.apply_chat_template(
             chat_history_formatted,
-            add_generation_prompt=True, # Crucial for instruction-following models
+            add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt"
-        ).to(model.device) # Ensure inputs are on the correct device (CPU)
+        ).to(model.device)
         input_length = inputs['input_ids'].shape[1]
         print(f"\nInput tokens: {input_length}")
-        # print(f"Formatted input text (approx): {tokenizer.decode(inputs['input_ids'][0])}") # For debugging
 
     except Exception as e:
         print(f"!!! Error applying chat template: {e}")
-        # Provide feedback to the user
         return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"
 
-    # 3. Generate response using the model
-    output_ids = None # Initialize output_ids
     try:
         print("Generating response...")
-        # Use torch.no_grad() to reduce memory footprint during inference
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
-                eos_token_id=stop_token_ids_list, # Use the list of stop token IDs
-                pad_token_id=tokenizer.eos_token_id, # Set pad token ID to EOS token ID
-                do_sample=True, # Enable sampling for less repetitive output
-                temperature=0.7, # Control randomness (lower = more focused)
-                top_p=0.9, # Use nucleus sampling
-                # num_beams=1, # Use 1 for sampling (greedy is default if do_sample=False)
-                # early_stopping=True # Stop generation early if EOS is reached
+                eos_token_id=stop_token_ids_list,
+                pad_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
             )
         print("Generation complete.")
 
     except Exception as e:
         print(f"!!! Error during model generation: {e}")
-        # Clean up potentially large tensors in case of error
-        del inputs
+        if inputs is not None: del inputs
         if output_ids is not None: del output_ids
         gc.collect()
         return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"
 
-    # 4. Decode the response
-    # We need to decode only the newly generated tokens, excluding the input tokens
     new_tokens = output_ids[0, input_length:]
     response = tokenizer.decode(new_tokens, skip_special_tokens=True)
 
     print(f"Output tokens: {len(new_tokens)}")
-    # print(f"Raw response: '{response}'") # Log the raw decoded output
 
-    # 5. Clean up memory
     del inputs
     del output_ids
     del new_tokens
-    gc.collect() # Explicitly run garbage collection
+    gc.collect()
     print("Memory cleaned.")
 
     return response
 
-# --- Gradio Interface ---
 print("--- Setting up Gradio Interface ---")
 
-# Use ChatInterface for a user-friendly chat experience
 chatbot_component = gr.Chatbot(
     label="HyperCLOVA X SEED (0.5B) 대화",
     bubble_full_width=False,
     height=600
 )
 
-# Define examples relevant to the model's strengths (Korean)
 examples = [
     ["네이버 클로바X는 무엇인가요?"],
     ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
@@ -177,10 +149,9 @@ examples = [
     ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
 ]
 
-# Create the Gradio ChatInterface
 demo = gr.ChatInterface(
-    fn=predict, # The function to call for generating responses
-    chatbot=chatbot_component, # The chatbot display component
+    fn=predict,
+    chatbot=chatbot_component,
     title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
     description=(
         f"**모델:** {MODEL_ID}\n"
@@ -189,21 +160,13 @@ demo = gr.ChatInterface(
         f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
     ),
     examples=examples,
-    cache_examples=False, # Disable caching on free tier to save disk/memory
-    theme="soft", # Use a soft theme
+    cache_examples=False,
+    theme="soft",
     retry_btn="다시 시도",
     undo_btn="이전 턴 삭제",
     clear_btn="대화 초기화",
 )
 
-# --- Launch the App ---
 if __name__ == "__main__":
     print("--- Launching Gradio App ---")
-    # queue() is important for handling multiple users, especially with slow inference on CPU
-    # Use concurrency_count=1 if resource exhaustion occurs, otherwise default might be okay.
-    demo.queue(
-        # default_concurrency_limit=1 # Limit concurrent requests if needed
-    ).launch(
-        # share=False # Set to True to get a public link (requires login)
-        # server_name="0.0.0.0" # To make it accessible on the network if running locally
-    )
+    demo.queue().launch()
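
A minimal standalone sketch (not part of this commit) for checking that the stop tokens kept by this change resolve to usable token IDs, assuming only transformers is installed; the model ID and token strings come from the diff above, everything else is illustrative.

# Illustrative only, not part of app.py: confirm the stop tokens resolve to
# IDs before relying on them in model.generate(eos_token_id=...).
from transformers import AutoTokenizer

MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

stop_token_strings = ["<|endofturn|>", "<|stop|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_token_strings]

# Same guard as the app: keep the official EOS id and drop anything unresolved
# (convert_tokens_to_ids may return None or the UNK id for unknown tokens,
# depending on the tokenizer).
if tokenizer.eos_token_id not in stop_token_ids:
    stop_token_ids.append(tokenizer.eos_token_id)
stop_token_ids = [tid for tid in stop_token_ids if tid is not None]

print("stop token ids:", stop_token_ids)
print("eos id:", tokenizer.eos_token_id, "unk id:", tokenizer.unk_token_id)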