ROBO-R1984 / app-backup-last.py
openfree's picture
Rename app.py to app-backup-last.py
601a19f verified
#!/usr/bin/env python3
import os
import re
import tempfile
import gc
from collections.abc import Iterator
from threading import Thread, Lock
import json
import requests
import cv2
import gradio as gr
import spaces
import torch
import numpy as np
from loguru import logger
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, pipeline
import time
import warnings
from typing import Dict, List, Optional, Union
import librosa
import scipy.signal as sps
import queue
# CSV/TXT analysis
import pandas as pd
# PDF text extraction
import PyPDF2
# Silence every Python warning for the whole process.
warnings.filterwarnings('ignore')
# Logging setup: drop loguru's default sink and print INFO+ records, flushing each one.
logger.remove()
logger.add(lambda msg: print(msg, flush=True), level="INFO")
print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B + Whisper)...")
##############################################################################
# Constants
##############################################################################
# Character cap applied to text extracted from CSV/TXT/PDF documents.
MAX_CONTENT_CHARS = 2000
# Token threshold checked against the tokenized prompt in analyze_image_for_robot.
MAX_INPUT_LENGTH = 2096
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_NUM_IMAGES = 5
# Bearer token for the SerpHouse search API; empty string when the env var is unset.
SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
##############################################################################
# Global state
##############################################################################
# Lazily populated by load_model() / load_whisper().
model = None
processor = None
whisper_model = None
model_loaded = False
whisper_loaded = False
# Display name used in log messages.
model_name = "Gemma3-R1984-4B"
# Audio-related globals; audio_lock guards last_audio_data and last_transcription.
audio_lock = Lock()
last_audio_data = None
last_transcription = ""
##############################################################################
# Memory management
##############################################################################
def clear_cuda_cache():
    """Free unreachable Python objects, then release PyTorch's cached CUDA memory.

    The garbage collector runs first (and on every call, with or without a GPU)
    so that tensors kept alive only by reference cycles are destroyed before the
    CUDA caching allocator is asked to give its unused blocks back.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
##############################################################################
# Keyword extraction
##############################################################################
def extract_keywords(text: str, top_k: int = 5) -> str:
    """Return up to ``top_k`` distinct tokens of ``text`` joined by single spaces.

    Every character that is not an ASCII letter, a digit, a Hangul syllable or
    whitespace is removed. The remaining text is split on whitespace, tokens of
    length 1 are dropped, duplicates are removed keeping first-seen order, and
    the first ``top_k`` survivors are returned.

    Args:
        text: Free-form text to pull search keywords from.
        top_k: Maximum number of tokens to keep.

    Returns:
        The selected tokens separated by spaces ("" when nothing survives).
    """
    # The Hangul syllable block is spelled with \u escapes so the character
    # class cannot be corrupted when the file is re-encoded; a mis-encoded
    # literal range here makes re raise "bad character range".
    cleaned = re.sub(r"[^a-zA-Z0-9\uac00-\ud7a3\s]", "", text)
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = list(dict.fromkeys(tok for tok in cleaned.split() if len(tok) > 1))
    return " ".join(unique_tokens[:top_k])
##############################################################################
# Whisper model loading
##############################################################################
@spaces.GPU(duration=60)
def load_whisper():
    """Create the Whisper speech-recognition pipeline once and cache it globally.

    Returns:
        True when the pipeline is available (freshly loaded or already loaded),
        False when building it raised an exception (the error is logged).
    """
    global whisper_model, whisper_loaded

    if not whisper_loaded:
        try:
            logger.info("Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
            # Pipeline-style loading: CUDA device 0 when present, otherwise CPU.
            if torch.cuda.is_available():
                target_device = 0
            else:
                target_device = "cpu"
            whisper_model = pipeline(
                task="automatic-speech-recognition",
                model="openai/whisper-base",
                chunk_length_s=30,
                device=target_device,
            )
            whisper_loaded = True
            logger.info("โœ… Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
            return True
        except Exception as e:
            logger.error(f"Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
            return False

    logger.info("Whisper ๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
    return True
##############################################################################
# Audio processing functions (simplified)
##############################################################################
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` and return it as float32.

    When both rates are equal the samples are only cast to float32. Otherwise
    the output length is ``round(len(audio) * target_sr / orig_sr)`` and the
    conversion is done with ``scipy.signal.resample``.
    """
    if orig_sr != target_sr:
        # scipy-based resampling to the computed sample count.
        target_len = round(len(audio) * float(target_sr) / orig_sr)
        audio = sps.resample(audio, target_len)
    return audio.astype(np.float32)
@spaces.GPU(duration=30)
def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
    """Transcribe an audio array with the cached Whisper pipeline.

    Args:
        audio_array: Audio samples handed to the pipeline as ``"array"``.
        sr: Sampling rate reported to the pipeline.

    Returns:
        The stripped transcription, or None when Whisper cannot be loaded, the
        peak absolute amplitude is below 0.01, the transcription is empty, or
        any exception occurs (logged together with its traceback).
    """
    global whisper_model, whisper_loaded

    # Load the pipeline on first use; give up when that fails.
    if not whisper_loaded and not load_whisper():
        return None

    try:
        # Skip audio that is too quiet to contain speech.
        peak = np.max(np.abs(audio_array))
        if peak < 0.01:
            logger.warning("์˜ค๋””์˜ค๊ฐ€ ๋„ˆ๋ฌด ์กฐ์šฉํ•จ")
            return None

        # Speech recognition.
        asr_output = whisper_model({"array": audio_array, "sampling_rate": sr})
        transcription = asr_output["text"].strip()
        logger.info(f"Whisper ์ „์‚ฌ ์„ฑ๊ณต: {transcription[:50]}...")
        return transcription or None
    except Exception as e:
        logger.error(f"Whisper ์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return None
def process_audio_recording(audio_data):
    """Normalize a recorded clip, remember it, and transcribe it with Whisper.

    Args:
        audio_data: Expected to be a ``(sample_rate, samples)`` tuple; anything
            else is logged and rejected.

    Returns:
        The transcription string, or None when the input is missing/empty/of
        an unexpected shape, when transcription yields nothing, or when an
        exception occurs (logged together with its traceback).

    Side effects:
        Under ``audio_lock``, stores ``(samples, 16000)`` in ``last_audio_data``
        and, on success, the text in ``last_transcription``.
    """
    global last_audio_data, last_transcription, audio_lock
    if audio_data is None:
        return None
    try:
        # Unpack the audio payload.
        if isinstance(audio_data, tuple) and len(audio_data) == 2:
            sr, audio = audio_data
        else:
            logger.warning(f"์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋””์˜ค ํ˜•์‹: {type(audio_data)}")
            return None
        if audio is None or len(audio) == 0:
            return None
        # Convert to a numpy array.
        if not isinstance(audio, np.ndarray):
            audio = np.array(audio)
        # Integer PCM must be scaled to [-1, 1]; otherwise the 0.01 silence
        # threshold in transcribe_audio_whisper never triggers and the
        # recognizer is fed raw integer amplitudes.
        # NOTE(review): Gradio's numpy microphone output is typically int16 - confirm.
        if np.issubdtype(audio.dtype, np.integer):
            info = np.iinfo(audio.dtype)
            scale = float(max(abs(info.min), info.max))
            audio = audio.astype(np.float32) / scale
        else:
            audio = audio.astype(np.float32)
        # Down-mix multi-channel audio to mono (assumes samples x channels layout).
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        # Resample to 16 kHz.
        if sr != 16000:
            audio = resample_audio(audio, sr, 16000)
        # Store the normalized clip.
        with audio_lock:
            last_audio_data = (audio, 16000)
        logger.info(f"์˜ค๋””์˜ค ์ €์žฅ ์™„๋ฃŒ: {len(audio)/16000:.1f}์ดˆ")
        # Attempt transcription.
        transcription = transcribe_audio_whisper(audio, 16000)
        if transcription:
            with audio_lock:
                last_transcription = transcription
        return transcription
    except Exception as e:
        logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return None
##############################################################################
# Web search
##############################################################################
def do_web_search(query: str) -> str:
    """Query the SerpHouse live SERP endpoint and format the hits as markdown.

    Args:
        query: Search terms sent as the ``q`` parameter.

    Returns:
        An instruction header followed by up to ten formatted organic results,
        a "no results" message when the response carries none, or a failure
        message containing the exception text when anything raises.
    """
    try:
        logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
        response = requests.get(
            "https://api.serphouse.com/serp/live",
            headers={"Authorization": f"Bearer {SERPHOUSE_API_KEY}"},
            params={
                "q": query,
                "domain": "google.com",
                "serp_type": "web",
                "device": "desktop",
                "lang": "ko",  # prefer Korean
                "num": "10",  # limit to 10 results
            },
            timeout=60,
        )
        response.raise_for_status()

        results = response.json().get("results", {})
        if isinstance(results, dict):
            organic = results.get("organic", [])
        else:
            organic = []
        if not organic:
            return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        summary_lines = []
        for rank, hit in enumerate(organic[:10], start=1):
            hit_title = hit.get("title", "์ œ๋ชฉ ์—†์Œ")
            hit_link = hit.get("link", "#")
            hit_snippet = hit.get("snippet", "์„ค๋ช… ์—†์Œ")
            hit_shown_link = hit.get("displayed_link", hit_link)
            summary_lines.append(
                f"### ๊ฒฐ๊ณผ {rank}: {hit_title}\n\n"
                f"{hit_snippet}\n\n"
                f"**์ถœ์ฒ˜**: [{hit_shown_link}]({hit_link})\n\n"
                f"---\n"
            )

        instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
"""
        return instructions + "\n".join(summary_lines)
    except Exception as e:
        logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
        return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
##############################################################################
# Document processing functions
##############################################################################
def analyze_csv_file(path: str) -> str:
    """Render the start of a CSV file as text for inclusion in a prompt.

    At most the first 50 rows and 10 columns are kept, and the rendered table
    is cut at ``MAX_CONTENT_CHARS`` characters with a truncation marker.

    Returns:
        A markdown-style header with the file name followed by the table, or a
        failure message containing the exception text when reading fails.
    """
    try:
        frame = pd.read_csv(path)
        n_rows, n_cols = frame.shape
        if n_rows > 50 or n_cols > 10:
            frame = frame.iloc[:50, :10]
        rendered = frame.to_string()
        if len(rendered) > MAX_CONTENT_CHARS:
            rendered = rendered[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{rendered}"
    except Exception as e:
        return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
def analyze_txt_file(path: str) -> str:
    """Read a UTF-8 text file and return its (possibly truncated) contents.

    Content beyond ``MAX_CONTENT_CHARS`` characters is replaced by a truncation
    marker.

    Returns:
        A markdown-style header with the file name followed by the text, or a
        failure message containing the exception text when reading fails.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            body = handle.read()
        too_long = len(body) > MAX_CONTENT_CHARS
        if too_long:
            body = body[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{body}"
    except Exception as e:
        return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
def pdf_to_markdown(pdf_path: str) -> str:
    """Extract text from the first pages of a PDF as markdown sections.

    Up to five pages are read. Each page's text is capped at
    ``MAX_CONTENT_CHARS // pages_read`` characters, and the combined text is
    capped again at ``MAX_CONTENT_CHARS``.

    Returns:
        A markdown-style header with the file name followed by the page
        sections, or a failure message containing the exception text when the
        PDF cannot be read.
    """
    sections = []
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            total_pages = len(reader.pages)
            max_pages = min(5, total_pages)
            for page_num in range(max_pages):
                extracted = reader.pages[page_num].extract_text() or ""
                extracted = extracted.strip()
                if not extracted:
                    continue
                # Per-page budget; computed here so it is never evaluated for a PDF with zero pages.
                page_budget = MAX_CONTENT_CHARS // max_pages
                if len(extracted) > page_budget:
                    extracted = extracted[:page_budget] + "...(์ค‘๋žต)"
                sections.append(f"## ํŽ˜์ด์ง€ {page_num+1}\n\n{extracted}\n")
            if total_pages > max_pages:
                sections.append(f"\n...({max_pages}/{total_pages} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
    except Exception as e:
        return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"

    full_text = "\n".join(sections)
    if len(full_text) > MAX_CONTENT_CHARS:
        full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
    return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
##############################################################################
# Model loading
##############################################################################
@spaces.GPU(duration=120)
def load_model():
    """Load the Gemma3 processor and model once and cache them globally.

    The checkpoint id comes from the ``MODEL_ID`` environment variable and
    defaults to ``VIDraft/Gemma-3-R1984-4B``.

    Returns:
        True when the model is available (freshly loaded or already loaded),
        False when loading raised an exception (the error is logged).
    """
    global model, processor, model_loaded

    if not model_loaded:
        try:
            logger.info("Gemma3-R1984-4B ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
            clear_cuda_cache()
            checkpoint = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
            processor = AutoProcessor.from_pretrained(checkpoint, padding_side="left")
            model = Gemma3ForConditionalGeneration.from_pretrained(
                checkpoint,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                attn_implementation="eager",
            )
            model_loaded = True
            logger.info(f"โœ… {model_name} ๋กœ๋”ฉ ์™„๋ฃŒ!")
            return True
        except Exception as e:
            logger.error(f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
            return False

    logger.info("๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
    return True
##############################################################################
# Image analysis (robot-task oriented)
##############################################################################
@spaces.GPU(duration=60)
def analyze_image_for_robot(
    image: Union[np.ndarray, Image.Image],
    prompt: str,
    task_type: str = "general",
    use_web_search: bool = False,
    enable_thinking: bool = False,
    max_new_tokens: int = 300,
    audio_transcript: Optional[str] = None
) -> str:
    """Run the vision-language model on one image for a robot task.

    Args:
        image: Frame to analyze; numpy arrays are converted to RGB PIL images.
        prompt: User instruction placed after the image.
        task_type: Key selecting the system prompt ("general", "planning",
            "grounding", "affordance", "trajectory", "pointing"); unknown keys
            fall back to "general".
        use_web_search: When true, keywords from ``prompt`` are searched and
            the results are prepended to the system prompt.
        enable_thinking: When true, asks the model to write its reasoning in
            ``<thinking>`` tags before the answer.
        max_new_tokens: Generation budget.
        audio_transcript: Optional recognized speech appended to the user
            prompt (and, for "planning", reflected in the system prompt).

    Returns:
        The generated text, or an error string (including a traceback) when
        loading or generation fails. The CUDA cache is cleared on every exit.
    """
    global model, processor
    if not model_loaded:
        if not load_model():
            return "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
    try:
        # Convert numpy frames to PIL images.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image).convert('RGB')
        # Per-task system prompts.
        system_prompts = {
            "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ๋ถ„์„ํ•˜์„ธ์š”.",
            "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
๋จผ์ € ์žฅ๋ฉด ์ดํ•ด๋ฅผ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ๊ทธ ๋‹ค์Œ ์ž‘์—… ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
ํ˜•์‹:
[์žฅ๋ฉด ์ดํ•ด] ํ˜„์žฌ ๋ณด์ด๋Š” ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…
[์ž‘์—… ๊ณ„ํš]
Step_1: xxx
Step_2: xxx
Step_n: xxx""",
            "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ๋ณด์ด๋Š” ๊ฐ์ฒด๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์š”์ฒญ๋œ ๊ฐ์ฒด ์œ„์น˜๋ฅผ [x1, y1, x2, y2]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
            "affordance": "๋‹น์‹ ์€ ํŒŒ์ง€์  ๋ถ„์„ AI์ž…๋‹ˆ๋‹ค. ๋จผ์ € ๋Œ€์ƒ ๊ฐ์ฒด๋ฅผ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
            "trajectory": "๋‹น์‹ ์€ ๊ฒฝ๋กœ ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค. ๋จผ์ € ํ™˜๊ฒฝ์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
            "pointing": "๋‹น์‹ ์€ ์ง€์  ์ง€์ • ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์ฐธ์กฐ์ ๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์œ„์น˜๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
        }
        # With audio available, the planning prompt also asks for a sound-recognition section.
        if audio_transcript and task_type == "planning":
            system_prompts["planning"] = """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
๋จผ์ € ์žฅ๋ฉด ์ดํ•ด๋ฅผ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์ฃผ๋ณ€ ์†Œ๋ฆฌ๋ฅผ ์ธ์‹ํ–ˆ๋‹ค๋ฉด ๊ทธ๊ฒƒ๋„ ์„ค๋ช…ํ•œ ํ›„, ์ž‘์—… ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
ํ˜•์‹:
[์žฅ๋ฉด ์ดํ•ด] ํ˜„์žฌ ๋ณด์ด๋Š” ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…
[์ฃผ๋ณ€ ์†Œ๋ฆฌ ์ธ์‹] ๋“ค๋ฆฌ๋Š” ์†Œ๋ฆฌ๋‚˜ ์Œ์„ฑ์„ 1์ค„๋กœ ์„ค๋ช…
[์ž‘์—… ๊ณ„ํš]
Step_1: xxx
Step_2: xxx
Step_n: xxx"""
        system_prompt = system_prompts.get(task_type, system_prompts["general"])
        # Optional chain-of-thought instruction.
        if enable_thinking:
            system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ž‘์„ฑ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”. ์žฅ๋ฉด ์ดํ•ด๋Š” ์ถ”๋ก  ๊ณผ์ •๊ณผ ๋ณ„๋„๋กœ ๋ฐ˜๋“œ์‹œ ํฌํ•จํ•˜์„ธ์š”."
        # Optional web search; results go in front of the system prompt.
        combined_system = system_prompt
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
                search_results = do_web_search(keywords)
                combined_system = f"{search_results}\n\n{system_prompt}"
        # Append recognized audio to the user prompt.
        user_prompt = prompt
        if audio_transcript:
            user_prompt += f"\n\n[์ธ์‹๋œ ์ฃผ๋ณ€ ์†Œ๋ฆฌ: {audio_transcript}]"
        # Build the chat messages.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": combined_system}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image},
                    {"type": "text", "text": user_prompt}
                ]
            }
        ]
        # Tokenize and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)
        # The prompt length must come from the tensor that is actually passed
        # to generate(). The earlier code assigned shortened tensors as
        # attributes, which did not change what **inputs expands to but did
        # shorten the offset used below, so long prompts leaked into the reply.
        # The prompt is not truncated here: slicing from the left would drop the
        # leading system text and possibly the image placeholder tokens.
        prompt_len = inputs["input_ids"].shape[1]
        if prompt_len > MAX_INPUT_LENGTH:
            logger.warning(
                f"Prompt has {prompt_len} tokens (MAX_INPUT_LENGTH={MAX_INPUT_LENGTH}); sending it untruncated"
            )
        # Generate.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        # Keep only the newly generated tokens.
        generated_tokens = outputs[0][prompt_len:]
        # Decode.
        response = processor.decode(generated_tokens, skip_special_tokens=True).strip()
        # Strip a leftover chat-template role marker, if any.
        if response.startswith("model\n"):
            response = response[6:].strip()
        elif response.startswith("model"):
            response = response[5:].strip()
        return response
    except Exception as e:
        logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        import traceback
        return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
    finally:
        clear_cuda_cache()
##############################################################################
# Document analysis (streaming)
##############################################################################
def _model_gen_with_oom_catch(**kwargs):
    """Call ``model.generate(**kwargs)`` and translate CUDA OOM into RuntimeError.

    Intended as a thread target for streaming generation.

    Raises:
        RuntimeError: When generation runs out of GPU memory; the original
            OutOfMemoryError is attached as the cause.

    The CUDA cache is cleared whether generation succeeds or fails.
    """
    global model
    try:
        model.generate(**kwargs)
    except torch.cuda.OutOfMemoryError as err:
        # Chain explicitly so the allocator's own message stays in the traceback.
        raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.") from err
    finally:
        clear_cuda_cache()
@spaces.GPU(duration=120)
def analyze_documents_streaming(
    files: List[str],
    prompt: str,
    use_web_search: bool = False,
    max_new_tokens: int = 2048
) -> Iterator[str]:
    """Stream the model's analysis of uploaded CSV/TXT/PDF documents.

    Args:
        files: Paths of the uploaded documents; files with other extensions
            are skipped. ``None`` is treated as no documents.
        prompt: The user's analysis request, appended after the documents.
        use_web_search: When true, keywords from ``prompt`` are searched and
            the results are prepended to the system prompt.
        max_new_tokens: Generation budget.

    Yields:
        The accumulated response text after each streamed chunk, or a single
        error message when loading or generation fails.
    """
    global model, processor
    if not model_loaded:
        if not load_model():
            yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
            return
    try:
        # System prompt.
        system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."
        # Optional web search.
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                search_results = do_web_search(keywords)
                system_content = f"{search_results}\n\n{system_content}"
        # Convert each supported document to text.
        doc_contents = []
        for file_path in files or []:
            lowered = file_path.lower()
            if lowered.endswith('.csv'):
                content = analyze_csv_file(file_path)
            elif lowered.endswith('.txt'):
                content = analyze_txt_file(file_path)
            elif lowered.endswith('.pdf'):
                content = pdf_to_markdown(file_path)
            else:
                continue
            doc_contents.append(content)
        # Build the chat messages.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_content}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
                ]
            }
        ]
        # Tokenize and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)
        # Streaming setup.
        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.9,
        )
        # Generate on a separate thread so chunks can be yielded as they arrive.
        worker = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
        worker.start()
        # Stream the accumulated output.
        output = ""
        for new_text in streamer:
            output += new_text
            yield output
        # The streamer has ended; wait for the worker so its cleanup finishes
        # before this function clears the cache and returns.
        worker.join()
    except Exception as e:
        logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
    finally:
        clear_cuda_cache()
##############################################################################
# Gradio UI (robot visualization oriented)
##############################################################################
# Stylesheet passed to gr.Blocks(css=...); the class names below are referenced
# through elem_classes and inline HTML snippets in the UI definition.
css = """
.robot-header {
text-align: center;
background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.status-box {
text-align: center;
padding: 10px;
border-radius: 5px;
margin: 10px 0;
font-weight: bold;
}
.info-box {
background: #f0f0f0;
padding: 15px;
border-radius: 8px;
margin: 10px 0;
border-left: 4px solid #2a5298;
}
.task-button {
min-height: 60px;
font-size: 1.1em;
}
.webcam-container {
border: 3px solid #2a5298;
border-radius: 10px;
padding: 10px;
background: #f8f9fa;
}
.auto-capture-status {
text-align: center;
padding: 5px;
border-radius: 5px;
margin: 5px 0;
font-weight: bold;
background: #e8f5e9;
color: #2e7d32;
}
.audio-status {
text-align: center;
padding: 5px;
border-radius: 5px;
margin: 5px 0;
font-weight: bold;
background: #e3f2fd;
color: #1565c0;
}
"""
# Top-level Gradio app: header, webcam/audio inputs on the left, analysis
# settings and results on the right, plus a hidden document-analysis tab.
with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
gr.HTML("""
<div class="robot-header">
<h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
<h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค ์Œ์„ฑ ์ธ์‹</h3>
<p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
</div>
""")
with gr.Row():
# Left column: webcam and inputs
with gr.Column(scale=1):
gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
with gr.Group(elem_classes="webcam-container"):
webcam = gr.Image(
sources=["webcam"],
streaming=True,
type="numpy",
label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
height=300
)
# Auto-capture status indicator
auto_capture_status = gr.HTML(
'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋Œ€๊ธฐ ์ค‘</div>'
)
# Captured image preview (hidden until a frame is captured)
captured_image = gr.Image(
label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
height=180,
visible=False
)
# Audio controls
gr.Markdown("### ๐ŸŽค ์Œ์„ฑ ์ธ์‹")
with gr.Group():
# Audio status indicator
audio_status = gr.HTML(
'<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
)
# Recording widget (starts hidden)
audio_recorder = gr.Audio(
sources=["microphone"],
type="numpy",
label="๐ŸŽค 10์ดˆ ๋…น์Œ",
visible=False
)
# Last recognized text
last_transcript = gr.Textbox(
label="์ธ์‹๋œ ์Œ์„ฑ",
value="",
lines=2,
interactive=False
)
# Robot task buttons
gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—…")
with gr.Row():
capture_btn = gr.Button("๐Ÿ“ธ ์ˆ˜๋™ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
with gr.Column():
auto_capture_toggle = gr.Checkbox(
label="๐Ÿ”„ ์ž๋™ ์บก์ฒ˜ (10์ดˆ๋งˆ๋‹ค)",
value=False
)
use_audio_toggle = gr.Checkbox(
label="๐ŸŽค ์Œ์„ฑ ์ธ์‹ ์‚ฌ์šฉ",
value=False,
info="10์ดˆ๋งˆ๋‹ค ์Œ์„ฑ์„ ์ธ์‹ํ•˜์—ฌ ๋ถ„์„์— ํฌํ•จ"
)
with gr.Row():
planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
# Right column: analysis settings and results
with gr.Column(scale=2):
gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")
with gr.Row():
with gr.Column():
task_prompt = gr.Textbox(
label="์ž‘์—… ์„ค๋ช…",
placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
value="ํ˜„์žฌ ์žฅ๋ฉด์„ ๋ถ„์„ํ•˜๊ณ  ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์ œ์•ˆํ•˜์„ธ์š”.",
lines=2
)
with gr.Row():
use_web_search = gr.Checkbox(
label="๐Ÿ” ์›น ๊ฒ€์ƒ‰",
value=False
)
enable_thinking = gr.Checkbox(
label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ •",
value=False
)
max_tokens = gr.Slider(
label="์ตœ๋Œ€ ํ† ํฐ",
minimum=100,
maximum=1000,
value=300,
step=50
)
gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
result_output = gr.Textbox(
label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
lines=18,
max_lines=35,
show_copy_button=True,
elem_id="result"
)
status_display = gr.HTML(
'<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
)
# Document analysis tab (hidden)
with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
with gr.Row():
with gr.Column():
doc_files = gr.File(
label="๋ฌธ์„œ ์—…๋กœ๋“œ",
file_count="multiple",
file_types=[".pdf", ".csv", ".txt"],
type="filepath"
)
doc_prompt = gr.Textbox(
label="๋ถ„์„ ์š”์ฒญ",
placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
lines=3
)
doc_web_search = gr.Checkbox(
label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
value=False
)
analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")
with gr.Column():
doc_result = gr.Textbox(
label="๋ถ„์„ ๊ฒฐ๊ณผ",
lines=25,
max_lines=50
)
# Event handlers
# Holds the most recent webcam frame (written by the stream event below).
webcam_state = gr.State(None)
def capture_webcam(frame):
    """Promote the latest webcam frame to the captured-image slot.

    Returns a 3-tuple for (webcam_state, captured_image, status_display): the
    frame, an update that shows it, and a success banner; or ``None, None`` and
    an error banner when no frame is available.
    """
    if frame is not None:
        preview_update = gr.update(value=frame, visible=True)
        return frame, preview_update, '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'
    return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
def clear_capture():
    """Reset the capture state and the stored audio/transcription.

    Returns a 4-tuple for (webcam_state, captured_image, status_display,
    last_transcript): no frame, a hidden preview, the "ready" banner and an
    empty transcript.
    """
    global last_transcription, last_audio_data, audio_lock
    with audio_lock:
        last_transcription = ""
        last_audio_data = None
    ready_banner = '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
    hide_preview = gr.update(visible=False)
    return None, hide_preview, ready_banner, ""
def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
    """Analyze the captured image for one task type and format the result.

    Args:
        image: Captured frame; when None an error message is returned.
        prompt: Task description forwarded to the model.
        task_type: Task key forwarded to analyze_image_for_robot.
        use_search: Whether to include web search results.
        thinking: Whether to request a reasoning section.
        tokens: Generation budget.

    Returns:
        ``(formatted_result, status_html)`` for the result box and status banner.
    """
    global last_transcription, audio_lock
    if image is None:
        return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
    # Snapshot the current transcription under the lock.
    with audio_lock:
        transcript = last_transcription
    result = analyze_image_for_robot(
        image=image,
        prompt=prompt,
        task_type=task_type,
        use_web_search=use_search,
        enable_thinking=thinking,
        max_new_tokens=tokens,
        audio_transcript=transcript if transcript else None
    )
    # Format the result with a timestamped, task-specific heading.
    timestamp = time.strftime("%H:%M:%S")
    task_names = {
        "planning": "์ž‘์—… ๊ณ„ํš",
        "grounding": "๊ฐ์ฒด ์œ„์น˜",
        "affordance": "ํŒŒ์ง€์ ",
        "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
    }
    formatted_result = f"""๐Ÿค– {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ ({timestamp})
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
{result}
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""
    complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
    return formatted_result, complete_status
# Auto-capture and analysis function
@spaces.GPU(duration=60)
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio, audio_data):
    """Timer callback: analyze the current frame (and audio) in planning mode.

    Returns a 6-tuple for (captured_image, result_output, status_display,
    auto_capture_status, last_transcript, audio_recorder). The last element is
    always None, which resets the recorder for the next clip.
    """
    global last_transcription, audio_lock

    # Nothing to analyze until the webcam has delivered a frame.
    if webcam_frame is None:
        return (
            None,
            "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
            '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
            '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
            "๋Œ€๊ธฐ ์ค‘...",
            None,  # reset audio
        )

    stamp = time.strftime("%H:%M:%S")

    # Process a fresh recording when audio is enabled and a clip is present.
    if use_audio and audio_data is not None:
        logger.info(f"[{stamp}] ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์‹œ์ž‘")
        fresh_text = process_audio_recording(audio_data)
        if fresh_text:
            logger.info(f"์ƒˆ๋กœ์šด ์ „์‚ฌ: {fresh_text[:50]}...")

    # Pick up the latest stored transcription.
    spoken = ""
    if use_audio:
        with audio_lock:
            spoken = last_transcription
        if spoken:
            logger.info(f"๋ถ„์„์— ์‚ฌ์šฉํ•  ์Œ์„ฑ: {spoken[:50]}...")

    # Image analysis in task-planning mode.
    analysis = analyze_image_for_robot(
        image=webcam_frame,
        prompt=task_prompt,
        task_type="planning",
        use_web_search=use_search,
        enable_thinking=thinking,
        max_new_tokens=tokens,
        audio_transcript=spoken if spoken else None,
    )
    report = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({stamp})
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
{analysis}
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""

    # Text shown in the "recognized speech" box.
    if spoken:
        shown_transcript = spoken
    else:
        shown_transcript = "์Œ์„ฑ ์ธ์‹ ๋Œ€๊ธฐ ์ค‘..."

    return (
        webcam_frame,
        report,
        '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
        f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {stamp}</div>',
        shown_transcript,
        None,  # reset audio (ready for the next recording)
    )
# Webcam streaming: mirror every streamed frame into webcam_state
webcam.stream(
fn=lambda x: x,
inputs=[webcam],
outputs=[webcam_state]
)
# Manual capture button
capture_btn.click(
fn=capture_webcam,
inputs=[webcam_state],
outputs=[webcam_state, captured_image, status_display]
)
# Reset button
clear_capture_btn.click(
fn=clear_capture,
outputs=[webcam_state, captured_image, status_display, last_transcript]
)
# Task buttons: each binds a fixed task type to analyze_with_task
planning_btn.click(
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
outputs=[result_output, status_display]
)
grounding_btn.click(
fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
outputs=[result_output, status_display]
)
# Document analysis
def analyze_docs(files, prompt, use_search):
    """Run the streaming document analysis to completion and return its last output.

    Returns an error message when no files were uploaded.
    """
    if not files:
        return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."
    latest = ""
    # Each yielded value is the full text so far; only the final one is kept.
    for latest in analyze_documents_streaming(files, prompt, use_search):
        pass
    return latest
# Document analysis button
analyze_docs_btn.click(
fn=analyze_docs,
inputs=[doc_files, doc_prompt, doc_web_search],
outputs=[doc_result]
)
# Auto-capture timer (every 10 seconds); created inactive
timer = gr.Timer(10.0, active=False)
# Auto-capture toggle event
def toggle_auto_capture(enabled):
    """Start or stop the auto-capture timer.

    Returns ``(timer, status_html)``: an active 10-second timer with an
    "enabled" banner, or an inactive timer with a "disabled" banner.
    """
    if not enabled:
        return gr.Timer(active=False), '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋น„ํ™œ์„ฑํ™”๋จ</div>'
    return gr.Timer(10.0, active=True), '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ํ™œ์„ฑํ™”๋จ (10์ดˆ๋งˆ๋‹ค)</div>'
# Checkbox change updates the timer and the auto-capture status banner
auto_capture_toggle.change(
fn=toggle_auto_capture,
inputs=[auto_capture_toggle],
outputs=[timer, auto_capture_status]
)
# Audio toggle event
def toggle_audio(enabled):
    """Enable or disable speech recognition.

    Enabling loads Whisper first. In both directions the stored audio and
    transcription are cleared. Returns ``(recorder_update, status_html)``
    showing or hiding the recorder with the matching banner.
    """
    global last_transcription, last_audio_data, audio_lock

    if enabled:
        # Load the Whisper model before accepting recordings.
        load_whisper()

    # Reset stored audio state in either direction.
    with audio_lock:
        last_transcription = ""
        last_audio_data = None

    if enabled:
        logger.info("์˜ค๋””์˜ค ์ธ์‹ ํ™œ์„ฑํ™”๋จ")
        banner = '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ</div>'
        recorder_update = gr.update(visible=True)  # show audio_recorder
    else:
        logger.info("์˜ค๋””์˜ค ์ธ์‹ ๋น„ํ™œ์„ฑํ™”๋จ")
        banner = '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
        recorder_update = gr.update(visible=False)  # hide audio_recorder
    return (recorder_update, banner)
# Checkbox change shows/hides the recorder and updates the audio status banner
use_audio_toggle.change(
fn=toggle_audio,
inputs=[use_audio_toggle],
outputs=[audio_recorder, audio_status]
)
# Handling when an audio recording completes
def on_audio_recorded(audio_data):
    """Transcribe a finished recording and return the text to display.

    Falls back to the last stored transcription, or a "waiting" placeholder
    when there is none.
    """
    global last_transcription, audio_lock

    fresh_text = None
    if audio_data is not None:
        logger.info("์ƒˆ ์˜ค๋””์˜ค ๋…น์Œ ๊ฐ์ง€")
        fresh_text = process_audio_recording(audio_data)
    if fresh_text:
        return fresh_text
    with audio_lock:
        return last_transcription or "์Œ์„ฑ ์ธ์‹ ๋Œ€๊ธฐ ์ค‘..."
# Any change of the recorder value triggers transcription
audio_recorder.change(
fn=on_audio_recorded,
inputs=[audio_recorder],
outputs=[last_transcript]
)
# Timer tick event: runs the automatic capture/analysis and resets the recorder
timer.tick(
fn=auto_capture_and_analyze,
inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, audio_recorder],
outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript, audio_recorder]
)
# Initial model load
def initial_load():
    """Load the main model when the page loads and return a ready message."""
    ready_message = "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
    load_model()
    return ready_message
# Run initial_load on page load; its return value is discarded (outputs=None)
demo.load(
fn=initial_load,
outputs=None
)
# Script entry point: enable the request queue and serve on all interfaces, port 7860
if __name__ == "__main__":
print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B + Whisper)...")
demo.queue().launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True,
debug=False
)