kimhyunwoo committed
Commit c9ceb74 Β· verified Β· 1 Parent: dc10a73

Update app.py

Files changed (1): app.py (+138 -51)

app.py CHANGED
@@ -4,11 +4,14 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import gc
 import os
 import datetime
+import time

+# --- Configuration ---
 MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
 MAX_NEW_TOKENS = 512
 CPU_THREAD_COUNT = 4  # adjust if needed

+# --- Optional: Set CPU Threads ---
 # torch.set_num_threads(CPU_THREAD_COUNT)
 # os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
 # os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
@@ -18,60 +21,62 @@ print(f"PyTorch version: {torch.__version__}")
 print(f"Running on device: cpu")
 print(f"Torch Threads: {torch.get_num_threads()}")

+# --- Model and Tokenizer Loading ---
 print(f"--- Loading Model: {MODEL_ID} ---")
 print("This might take a few minutes, especially on the first launch...")

 model = None
 tokenizer = None
+load_successful = False

 try:
+    start_load_time = time.time()
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32,
         device_map="cpu",
-        force_download=True  # kept to work around an earlier error (set to False or remove if not needed)
+        # force_download=True  # commented out; not needed unless there is a cache problem
     )
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_ID,
-        force_download=True  # kept to work around an earlier error (set to False or remove if not needed)
+        # force_download=True  # commented out
     )
     model.eval()
-    print("--- Model and Tokenizer Loaded Successfully on CPU ---")
+    load_time = time.time() - start_load_time
+    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
+    load_successful = True

+    # --- Stop Token Configuration ---
     stop_token_strings = ["<|endofturn|>", "<|stop|>"]
     stop_token_ids_list = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

-    # Check that the tokenizer has a properly set eos_token_id
-    if tokenizer.eos_token is not None and tokenizer.eos_token_id not in stop_token_ids_list:
-        # Add only when eos_token_id is not None and not already in the list
-        if tokenizer.eos_token_id is not None:
-            stop_token_ids_list.append(tokenizer.eos_token_id)
-        else:
-            print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")
-    elif tokenizer.eos_token is None:
-        print("Warning: tokenizer.eos_token is not defined.")
-
+    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in stop_token_ids_list:
+        stop_token_ids_list.append(tokenizer.eos_token_id)
+    elif tokenizer.eos_token_id is None:
+        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

     stop_token_ids_list = [tid for tid in stop_token_ids_list if tid is not None]

     if not stop_token_ids_list:
-        print("Warning: Could not find any stop token IDs. Generation might not stop correctly.")
-        # Fallback: if there is no EOS token ID either, generation may misbehave
-        # If needed, hard-code a default EOS token ID or handle it another way
-        # e.g. stop_token_ids_list = [some_default_eos_id]
+        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
+        if tokenizer.eos_token_id is not None:
+            stop_token_ids_list = [tokenizer.eos_token_id]
+        else:
+            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
+            # Handle the error or set a default value here if needed

     print(f"Using Stop Token IDs: {stop_token_ids_list}")

 except Exception as e:
     print(f"!!! Error loading model: {e}")
-    if 'model' in locals() and model is not None:
-        del model
-    if 'tokenizer' in locals() and tokenizer is not None:
-        del tokenizer
+    if 'model' in locals() and model is not None: del model
+    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
     gc.collect()
-    raise gr.Error(f"Failed to load the model {MODEL_ID}. Please check the Space logs. Error: {e}")
+    # If loading fails before the app runs, consider terminating the process or handling it differently instead of a Gradio error
+    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")


+# --- System Prompt Definition ---
 def get_system_prompt():
     current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
     return (
@@ -80,72 +85,143 @@ def get_system_prompt():
         f"- μ‚¬μš©μžμ˜ μ§ˆλ¬Έμ— λŒ€ν•΄ μΉœμ ˆν•˜κ³  μžμ„Έν•˜κ²Œ ν•œκ΅­μ–΄λ‘œ λ‹΅λ³€ν•΄μ•Ό ν•œλ‹€."
     )

+# --- Warm-up Function ---
+def warmup_model():
+    if not load_successful or model is None or tokenizer is None:
+        print("Skipping warmup: Model not loaded successfully.")
+        return
+
+    print("--- Starting Model Warm-up ---")
+    try:
+        start_warmup_time = time.time()
+        warmup_message = "μ•ˆλ…•ν•˜μ„Έμš”"
+        system_prompt = get_system_prompt()
+        warmup_chat = [
+            {"role": "tool_list", "content": ""},
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": warmup_message}
+        ]
+
+        inputs = tokenizer.apply_chat_template(
+            warmup_chat,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to("cpu")
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=10,  # generate only a few tokens to save time
+                eos_token_id=stop_token_ids_list,
+                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
+                do_sample=False  # no sampling needed during warm-up
+            )
+
+        # Decode the result (optional, for verification)
+        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+        # print(f"Warm-up response (decoded): {response}")
+
+        del inputs
+        del output_ids
+        gc.collect()
+        warmup_time = time.time() - start_warmup_time
+        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
+
+    except Exception as e:
+        print(f"!!! Error during model warm-up: {e}")
+        # Make sure a warm-up failure does not block the app from running
+    finally:
+        gc.collect()  # Ensure cleanup even if warmup fails
+
+
+# --- Inference Function ---
 def predict(message, history):
+    """
+    Generates a response with HyperCLOVA X based on the user message and chat history.
+    Handles chat formatting, generation, decoding, and memory management.
+    Assumes 'history' is in the Gradio 'messages' format: a list of dicts with 'role' and 'content' keys.
+    """
+    if model is None or tokenizer is None:
+        return "였λ₯˜: λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."
+
     system_prompt = get_system_prompt()

+    # Assume history is a List[Dict] ('messages' format)
     chat_history_formatted = [
         {"role": "tool_list", "content": ""},
         {"role": "system", "content": system_prompt}
     ]
-    # Assume history is a list of (user, ai) tuples
-    for user_msg, ai_msg in history:
-        chat_history_formatted.append({"role": "user", "content": user_msg})
-        chat_history_formatted.append({"role": "assistant", "content": ai_msg})
+    # history looks like [{'role': 'user', 'content': '...'}, {'role': 'assistant', 'content': '...'}]
+    for turn in history:
+        # Check that each history item is a dict (safer)
+        if isinstance(turn, dict) and "role" in turn and "content" in turn:
+            chat_history_formatted.append(turn)
+        else:
+            # Warn when an unexpected format comes in (for debugging)
+            print(f"Warning: Unexpected history format item: {turn}")
+            # Add error handling or conversion logic here if needed
+

+    # Add the latest user message
     chat_history_formatted.append({"role": "user", "content": message})

     inputs = None
     output_ids = None

     try:
-        # The model was loaded with device_map="cpu", so send the inputs to the CPU as well.
         inputs = tokenizer.apply_chat_template(
             chat_history_formatted,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt"
-        ).to("cpu")  # explicitly place on CPU
+        ).to("cpu")  # Explicitly send to CPU
         input_length = inputs['input_ids'].shape[1]
         print(f"\nInput tokens: {input_length}")

     except Exception as e:
         print(f"!!! Error applying chat template: {e}")
+        # Provide feedback to the user
         return f"였λ₯˜: μž…λ ₯ ν˜•μ‹μ„ μ²˜λ¦¬ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

     try:
         print("Generating response...")
+        generation_start_time = time.time()
         with torch.no_grad():
-            # Passing a list to eos_token_id is the usual approach.
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
-                eos_token_id=stop_token_ids_list,  # use the corrected stop_token_ids_list
-                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,  # check pad_token_id
+                eos_token_id=stop_token_ids_list,
+                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
             )
-        print("Generation complete.")
+        generation_time = time.time() - generation_start_time
+        print(f"Generation complete in {generation_time:.2f} seconds.")

     except Exception as e:
         print(f"!!! Error during model generation: {e}")
+        # Clean up potentially large tensors in case of error
         if inputs is not None: del inputs
         if output_ids is not None: del output_ids
         gc.collect()
         return f"였λ₯˜: 응닡을 μƒμ„±ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

-    # Attempt decoding only when output_ids is not None
+    # Decode the response
+    response = "였λ₯˜: 응닡 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."  # default value
     if output_ids is not None:
-        new_tokens = output_ids[0, input_length:]
-        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-        print(f"Output tokens: {len(new_tokens)}")
-        del new_tokens  # free memory
-    else:
-        response = "였λ₯˜: 응닡 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€ (output_ids is None)."
-        print("Generation failed, output_ids is None.")
+        try:
+            new_tokens = output_ids[0, input_length:]
+            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+            print(f"Output tokens: {len(new_tokens)}")
+            del new_tokens
+        except Exception as e:
+            print(f"!!! Error decoding response: {e}")
+            response = "였λ₯˜: 응닡을 λ””μ½”λ”©ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."


-    # free memory
+    # Clean up memory
     if inputs is not None: del inputs
     if output_ids is not None: del output_ids
     gc.collect()
@@ -153,15 +229,15 @@ def predict(message, history):

     return response

-# --- Gradio Interface ---
+# --- Gradio Interface Setup ---
 print("--- Setting up Gradio Interface ---")

-# Resolve the UserWarning and use the newer format
+# Specify type='messages' to resolve the UserWarning and use the newer format
 chatbot_component = gr.Chatbot(
     label="HyperCLOVA X SEED (0.5B) λŒ€ν™”",
     bubble_full_width=False,
     height=600,
-    type='messages'  # explicitly use the message format
+    type='messages'  # set explicitly to ensure compatibility with ChatInterface
 )

 examples = [
@@ -171,23 +247,34 @@ examples = [
     ["μ œμ£Όλ„ μ—¬ν–‰ κ³„νšμ„ μ„Έμš°κ³  μžˆλŠ”λ°, 3λ°• 4일 μΆ”μ²œ μ½”μŠ€ μ’€ μ§œμ€„λž˜?"],
 ]

-# Removed the problematic arguments (retry_btn, undo_btn, clear_btn)
+# Unnecessary arguments removed when creating ChatInterface
 demo = gr.ChatInterface(
-    fn=predict,
-    chatbot=chatbot_component,
+    fn=predict,  # connect the prediction function
+    chatbot=chatbot_component,  # use the Chatbot component (type='messages' is set)
     title="πŸ‡°πŸ‡· 넀이버 HyperCLOVA X SEED (0.5B) 데λͺ¨",
     description=(
         f"**λͺ¨λΈ:** {MODEL_ID}\n"
         f"**ν™˜κ²½:** Hugging Face 무료 CPU (16GB RAM)\n"
-        f"**주의:** CPUμ—μ„œ μ‹€ν–‰λ˜λ―€λ‘œ 응닡 생성에 λ‹€μ†Œ μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€ (특히 첫 응닡). "
+        f"**주의:** CPUμ—μ„œ μ‹€ν–‰λ˜λ―€λ‘œ 응닡 생성에 λ‹€μ†Œ μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€. (μ›œμ—… μ‹œλ„λ¨)\n"
         f"μ΅œλŒ€ 생성 토큰 μˆ˜λŠ” {MAX_NEW_TOKENS}개둜 μ œν•œλ©λ‹ˆλ‹€."
     ),
     examples=examples,
-    cache_examples=False,
+    cache_examples=False,  # disable example caching on the free tier
     theme="soft",
+    # retry_btn, undo_btn, clear_btn, etc. are not directly supported in recent versions
 )

-# --- Launch the App ---
+# --- Application Launch ---
 if __name__ == "__main__":
+    # Run warm-up only when the model loaded successfully
+    if load_successful:
+        warmup_model()
+    else:
+        print("Skipping warm-up because model loading failed.")
+
     print("--- Launching Gradio App ---")
-    demo.queue().launch()
+    # queue() is useful for handling multiple users and managing long tasks
+    demo.queue().launch(
+        # share=True  # needed to create a public link (may require login)
+        # server_name="0.0.0.0"  # to allow access from the local network
+    )
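
A minimal sketch of how the updated predict function can be exercised outside Gradio, assuming it runs in the same process as app.py (so model, tokenizer, and predict are already defined); the history entries mirror the 'messages'-format dicts that gr.Chatbot(type='messages') passes in, and the sample values are illustrative only:

# Illustrative only: call predict() with a 'messages'-format history,
# mirroring what gr.ChatInterface supplies when the chatbot uses type='messages'.
sample_history = [
    {"role": "user", "content": "μ•ˆλ…•ν•˜μ„Έμš”"},
    {"role": "assistant", "content": "μ•ˆλ…•ν•˜μ„Έμš”! 무엇을 λ„μ™€λ“œλ¦΄κΉŒμš”?"},
]

print(predict("μ œμ£Όλ„ 3λ°• 4일 μ—¬ν–‰ μ½”μŠ€λ₯Ό μΆ”μ²œν•΄ μ€˜", sample_history))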