์†์„œํ˜„ committed on
Commit ed93606 · 1 Parent(s): c4d447f

feat: initial file

Dockerfile ADDED
@@ -0,0 +1,39 @@
+ FROM python:3.9
+
+ # Set cache directory environment variables
+ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
+ ENV HF_HOME=/tmp/huggingface_cache
+ ENV PYTHONUNBUFFERED=1
+
+ WORKDIR /app
+
+ # Install system packages
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     libgl1-mesa-glx \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create the cache directory and set permissions
+ RUN mkdir -p $TRANSFORMERS_CACHE && chmod -R 777 $TRANSFORMERS_CACHE
+
+ # Set permissions for the temporary directory
+ RUN mkdir -p /tmp/uploads && chmod 777 /tmp/uploads
+ ENV TMPDIR=/tmp/uploads
+
+ # Copy and install the requirements file
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application files
+ COPY . .
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+
+ # Download the kiwipiepy initialization files - resolves the pre-download issue
+ RUN python -c "from kiwipiepy import Kiwi; Kiwi()"
+
+ # Run the application (port changed to 7861)
+ EXPOSE 7861
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7861"]
api/__init__.py ADDED
File without changes
api/routes/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ API route package
+ """
+ from .matching_routers import router as matching_router
+
+ __all__ = ['matching_router']
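Note that matching_routers.py below only attaches Pydantic request/response models to this router; main.py registers its endpoints directly on the FastAPI app and never includes matching_router. A minimal, purely illustrative sketch of how the exported router would be mounted if handlers were added to it:

# Hypothetical wiring sketch (not part of this commit's main.py):
from fastapi import FastAPI
from api.routes import matching_router

app = FastAPI()
app.include_router(matching_router)  # would serve any handlers under the /api/matching prefix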
api/routes/matching_routers.py ADDED
@@ -0,0 +1,79 @@
+ """
+ API routes for found-item matching
+ """
+ import os
+ import sys
+ import logging
+ from typing import List, Dict, Any, Optional
+ from fastapi import APIRouter, Depends, HTTPException, Query, Body
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel, Field, validator
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Create the router
+ router = APIRouter(
+     prefix="/api/matching",
+     tags=["matching"],
+     responses={404: {"description": "Not found"}},
+ )
+
+ # Pydantic model definitions
+ class LostItemPost(BaseModel):
+     """Model for a post about an item the user has lost"""
+     category: str = Field(..., description="๋ถ„์‹ค๋ฌผ ์นดํ…Œ๊ณ ๋ฆฌ (์˜ˆ: ์ง€๊ฐ‘, ๊ฐ€๋ฐฉ, ์ „์ž๊ธฐ๊ธฐ)")
+     item_name: str = Field(..., description="๋ฌผํ’ˆ๋ช… (์˜ˆ: ๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘)")
+     color: Optional[str] = Field(None, description="๋ฌผํ’ˆ ์ƒ‰์ƒ")
+     content: str = Field(..., description="๊ฒŒ์‹œ๊ธ€ ๋‚ด์šฉ")
+     location: Optional[str] = Field(None, description="๋ถ„์‹ค ์žฅ์†Œ")
+     image_url: Optional[str] = Field(None, description="์ด๋ฏธ์ง€ URL (์žˆ๋Š” ๊ฒฝ์šฐ)")
+     lost_items: Optional[List[Dict[str, Any]]] = Field(None, description="๋น„๊ตํ•  ์Šต๋“๋ฌผ ๋ฐ์ดํ„ฐ (API ํ…Œ์ŠคํŠธ์šฉ)")
+
+     class Config:
+         schema_extra = {
+             "example": {
+                 "category": "์ง€๊ฐ‘",
+                 "item_name": "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘",
+                 "color": "๊ฒ€์ •์ƒ‰",
+                 "content": "์ง€๋‚œ์ฃผ ํ† ์š”์ผ ๊ฐ•๋‚จ์—ญ ๊ทผ์ฒ˜์—์„œ ๊ฒ€์ •์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค. ํ˜„๊ธˆ๊ณผ ์นด๋“œ๊ฐ€ ๋“ค์–ด์žˆ์–ด์š”.",
+                 "location": "๊ฐ•๋‚จ์—ญ",
+                 "image_url": None
+             }
+         }
+
+ class ImageMatchingRequest(BaseModel):
+     """Model for an image-based matching request"""
+     category: Optional[str] = Field(None, description="๋ถ„์‹ค๋ฌผ ์นดํ…Œ๊ณ ๋ฆฌ")
+     item_name: Optional[str] = Field(None, description="๋ฌผํ’ˆ๋ช…")
+     color: Optional[str] = Field(None, description="์ƒ‰์ƒ")
+     content: Optional[str] = Field(None, description="๋‚ด์šฉ")
+     image_base64: Optional[str] = Field(None, description="Base64 ์ธ์ฝ”๋”ฉ๋œ ์ด๋ฏธ์ง€")
+     lost_items: Optional[List[Dict[str, Any]]] = Field(None, description="๋น„๊ตํ•  ์Šต๋“๋ฌผ ๋ฐ์ดํ„ฐ (API ํ…Œ์ŠคํŠธ์šฉ)")
+
+     class Config:
+         schema_extra = {
+             "example": {
+                 "category": "์ง€๊ฐ‘",
+                 "item_name": "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘",
+                 "color": "๊ฒ€์ •์ƒ‰",
+                 "content": "์ง€๋‚œ์ฃผ ํ† ์š”์ผ ๊ฐ•๋‚จ์—ญ ๊ทผ์ฒ˜์—์„œ ๊ฒ€์ •์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค.",
+                 "image_base64": "[base64 encoded image string]"
+             }
+         }
+
+ class MatchingResult(BaseModel):
+     """Matching result model"""
+     total_matches: int = Field(..., description="๋งค์นญ๋œ ํ•ญ๋ชฉ ์ˆ˜")
+     similarity_threshold: float = Field(..., description="์œ ์‚ฌ๋„ ์ž„๊ณ„๊ฐ’")
+     matches: List[Dict[str, Any]] = Field(..., description="๋งค์นญ๋œ ํ•ญ๋ชฉ ๋ชฉ๋ก")
+
+ class MatchingResponse(BaseModel):
+     """API response model"""
+     success: bool = Field(..., description="์š”์ฒญ ์„ฑ๊ณต ์—ฌ๋ถ€")
+     message: str = Field(..., description="์‘๋‹ต ๋ฉ”์‹œ์ง€")
+     result: Optional[MatchingResult] = Field(None, description="๋งค์นญ ๊ฒฐ๊ณผ")
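Because the request models above are plain Pydantic classes, a client (or a test) can build the payload directly. A minimal sketch, assuming a local image file and made-up item data (neither is part of this repository), of filling image_base64 for an ImageMatchingRequest:

# Hedged sketch: building a request payload for the matching endpoint.
# "my_wallet.jpg" and the lost_items entry are illustrative values only.
import base64

from api.routes.matching_routers import ImageMatchingRequest

with open("my_wallet.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = ImageMatchingRequest(
    category="์ง€๊ฐ‘",
    item_name="๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘",
    content="๊ฐ•๋‚จ์—ญ ๊ทผ์ฒ˜์—์„œ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค.",
    image_base64=encoded,
    lost_items=[{"category": "์ง€๊ฐ‘", "item_name": "๊ฒ€์€ ์ง€๊ฐ‘", "content": "์ง€๊ฐ‘์„ ์Šต๋“ํ–ˆ์Šต๋‹ˆ๋‹ค."}],
)
# The serialized dict is what the /api/matching/find-similar endpoint consumes.
print(list(payload.dict().keys()))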
main.py ADDED
@@ -0,0 +1,221 @@
+ """
+ FastAPI application main module
+ """
+ import os
+ import sys
+ import logging
+ import tempfile
+ from fastapi import FastAPI, Request, HTTPException, Query, Body
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from typing import List, Dict, Any, Optional, Union
+ import json
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+ # Cache directory configuration
+ os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
+ os.environ['HF_HOME'] = '/tmp/huggingface_cache'
+
+ # Create the directories
+ os.makedirs('/tmp/huggingface_cache', exist_ok=True)
+ os.makedirs('/tmp/uploads', exist_ok=True)
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Import the required modules
+ from models.clip_model import KoreanCLIPModel
+ from utils.similarity import calculate_similarity, find_similar_items
+ from api.routes.matching_routers import LostItemPost, ImageMatchingRequest, MatchingResult, MatchingResponse
+
+ # Model initialization (loaded as a singleton)
+ clip_model = None
+
+ def get_clip_model():
+     """
+     Return the Korean CLIP model instance (singleton pattern)
+     """
+     global clip_model
+     if clip_model is None:
+         try:
+             clip_model = KoreanCLIPModel()
+             return clip_model
+         except Exception as e:
+             logger.error(f"CLIP ๋ชจ๋ธ ์ดˆ๊ธฐํ™” ์‹คํŒจ: {str(e)}")
+             # Return None on failure (only text-based matching is possible)
+             return None
+     return clip_model
+
+ # Create the FastAPI application
+ app = FastAPI(
+     title="์Šต๋“๋ฌผ ์œ ์‚ฌ๋„ ๊ฒ€์ƒ‰ API",
+     description="ํ•œ๊ตญ์–ด CLIP ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜์—ฌ ์‚ฌ์šฉ์ž ๊ฒŒ์‹œ๊ธ€๊ณผ ์Šต๋“๋ฌผ ๊ฐ„์˜ ์œ ์‚ฌ๋„๋ฅผ ๊ณ„์‚ฐํ•˜๋Š” API",
+     version="1.0.0"
+ )
+
+ # CORS middleware configuration
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Global exception handling
+ @app.exception_handler(Exception)
+ async def global_exception_handler(request: Request, exc: Exception):
+     """
+     Global exception handler
+     """
+     logger.error(f"์š”์ฒญ ์ฒ˜๋ฆฌ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ: {str(exc)}")
+     return JSONResponse(
+         status_code=500,
+         content={"success": False, "message": f"์„œ๋ฒ„ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(exc)}"}
+     )
+
+ # API endpoint definition
+ @app.post("/api/matching/find-similar", response_model=MatchingResponse)
+ async def find_similar_items_api(
+     request: Union[LostItemPost, ImageMatchingRequest],
+     threshold: float = Query(0.7, description="์œ ์‚ฌ๋„ ์ž„๊ณ„๊ฐ’ (0.0 ~ 1.0)"),
+     limit: int = Query(10, description="๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ํ•ญ๋ชฉ ์ˆ˜")
+ ):
+     """
+     API endpoint that finds found items similar to the user's post
+
+     Args:
+         request: the user's lost-item post or an image matching request
+         threshold: similarity threshold
+         limit: maximum number of items to return
+
+     Returns:
+         MatchingResponse: response containing the matching result
+     """
+     try:
+         logger.info(f"์œ ์‚ฌ ์Šต๋“๋ฌผ ๊ฒ€์ƒ‰ ์š”์ฒญ: threshold={threshold}, limit={limit}")
+
+         # Convert the request data
+         user_post = {}
+         if isinstance(request, LostItemPost):
+             user_post = request.dict()
+         else:
+             user_post = request.dict()
+             # If a Base64 image is present, run the image handling logic
+             if user_post.get("image_base64"):
+                 try:
+                     # Decode the Base64 image
+                     base64_str = user_post["image_base64"]
+
+                     # Strip the header from the Base64 string (if present)
+                     if "," in base64_str:
+                         base64_str = base64_str.split(",")[1]
+
+                     image_data = base64.b64decode(base64_str)
+                     image = Image.open(BytesIO(image_data))
+
+                     # Use the image (passed to the CLIP model)
+                     user_post["image"] = image
+
+                     logger.info("Base64 ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์™„๋ฃŒ")
+                 except Exception as e:
+                     logger.error(f"Base64 ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹คํŒจ: {str(e)}")
+
+         # Instead of a DB, the found-item data passed in the request is used here.
+         lost_items = []
+
+         # Use the found-item data if it is included in the request
+         if hasattr(request, 'lost_items') and request.lost_items:
+             lost_items = request.lost_items
+
+         if not lost_items:
+             return MatchingResponse(
+                 success=False,
+                 message="์Šต๋“๋ฌผ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ์š”์ฒญ์— ์Šต๋“๋ฌผ ๋ฐ์ดํ„ฐ๋ฅผ ํฌํ•จํ•ด์ฃผ์„ธ์š”.",
+                 result=None
+             )
+
+         # Load the CLIP model
+         clip_model_instance = get_clip_model()
+
+         # Find similar items
+         similar_items = find_similar_items(user_post, lost_items, threshold, clip_model_instance)
+
+         # Limit the results
+         similar_items = similar_items[:limit]
+
+         # Build the response
+         result = MatchingResult(
+             total_matches=len(similar_items),
+             similarity_threshold=threshold,
+             matches=[
+                 {
+                     "item": item["item"],
+                     "similarity": round(item["similarity"], 4),
+                     "details": {
+                         "text_similarity": round(item["details"]["text_similarity"], 4),
+                         "image_similarity": round(item["details"]["image_similarity"], 4) if item["details"]["image_similarity"] else None,
+                         "category_similarity": round(item["details"]["details"]["category"], 4),
+                         "item_name_similarity": round(item["details"]["details"]["item_name"], 4),
+                         "color_similarity": round(item["details"]["details"]["color"], 4),
+                         "content_similarity": round(item["details"]["details"]["content"], 4)
+                     }
+                 }
+                 for item in similar_items
+             ]
+         )
+
+         return MatchingResponse(
+             success=True,
+             message=f"{len(similar_items)}๊ฐœ์˜ ์œ ์‚ฌํ•œ ์Šต๋“๋ฌผ์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.",
+             result=result
+         )
+
+     except Exception as e:
+         logger.error(f"API ํ˜ธ์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"์š”์ฒญ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}")
+
+ @app.get("/api/matching/test")
+ async def test_endpoint():
+     """
+     Endpoint for API testing
+
+     Returns:
+         dict: test response
+     """
+     return {"message": "API๊ฐ€ ์ •์ƒ์ ์œผ๋กœ ์ž‘๋™ ์ค‘์ž…๋‹ˆ๋‹ค."}
+
+ @app.get("/api/status")
+ async def status():
+     """
+     API status endpoint
+
+     Returns:
+         dict: API status information
+     """
+     # Try to load the CLIP model
+     model = get_clip_model()
+
+     return {
+         "status": "ok",
+         "models_loaded": model is not None,
+         "version": "1.0.0"
+     }
+
+ # Root endpoint
+ @app.get("/")
+ async def root():
+     """
+     Root endpoint - provides API information
+     """
+     return {
+         "app_name": "์Šต๋“๋ฌผ ์œ ์‚ฌ๋„ ๊ฒ€์ƒ‰ API",
+         "version": "1.0.0",
+         "description": "ํ•œ๊ตญ์–ด CLIP ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜์—ฌ ์‚ฌ์šฉ์ž ๊ฒŒ์‹œ๊ธ€๊ณผ ์Šต๋“๋ฌผ ๊ฐ„์˜ ์œ ์‚ฌ๋„๋ฅผ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค.",
+         "api_endpoint": "/api/matching/find-similar",
+         "test_endpoint": "/api/matching/test",
+         "status_endpoint": "/api/status"
+     }
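Since the endpoint reads its candidate found items from the request body (lost_items) rather than from a database, it can be exercised end to end with FastAPI's TestClient. A minimal text-only sketch, assuming torch/transformers are installed; the post and candidate data are illustrative, and note that the first request still calls get_clip_model(), which will try to download the CLIP weights even when no images are compared:

# Hedged sketch: calling /api/matching/find-similar with in-request candidate data.
from fastapi.testclient import TestClient

from main import app

client = TestClient(app)

body = {
    "category": "์ง€๊ฐ‘",
    "item_name": "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘",
    "content": "๊ฐ•๋‚จ์—ญ ๊ทผ์ฒ˜์—์„œ ๊ฒ€์ •์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค.",
    "lost_items": [
        {"category": "์ง€๊ฐ‘", "item_name": "๊ฒ€์ • ๊ฐ€์ฃฝ ์ง€๊ฐ‘", "content": "์ง€ํ•˜์ฒ ์—์„œ ๊ฒ€์ • ์ง€๊ฐ‘์„ ์Šต๋“ํ–ˆ์Šต๋‹ˆ๋‹ค."}
    ],
}

# threshold and limit are query parameters; a low threshold lets the single candidate through.
resp = client.post("/api/matching/find-similar?threshold=0.2&limit=5", json=body)
print(resp.status_code, resp.json()["message"])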
models/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Package for model-related modules
+ """
+ from .clip_model import KoreanCLIPModel
+
+ __all__ = ['KoreanCLIPModel']
models/clip_model.py ADDED
@@ -0,0 +1,178 @@
+ """
+ Korean CLIP model implementation
+ This module uses a Korean CLIP model from HuggingFace to generate text and image embeddings
+ """
+ import os
+ import sys
+ import logging
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+ from PIL import Image
+ import requests
+ from io import BytesIO
+ import numpy as np
+
+ # Cache directory configuration
+ os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
+ os.environ['HF_HOME'] = '/tmp/huggingface_cache'
+
+ # Create the directories
+ os.makedirs('/tmp/huggingface_cache', exist_ok=True)
+ os.makedirs('/tmp/uploads', exist_ok=True)
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Model configuration - taken from environment variables or defaults
+ CLIP_MODEL_NAME = os.getenv('CLIP_MODEL_NAME', 'Bingsu/clip-vit-large-patch14-ko')
+ DEVICE = "cuda" if torch.cuda.is_available() and os.getenv('USE_GPU', 'True').lower() == 'true' else "cpu"
+
+ class KoreanCLIPModel:
+     """
+     Korean CLIP model class
+     Provides text/image embedding and similarity computation
+     """
+
+     def __init__(self, model_name=CLIP_MODEL_NAME, device=DEVICE):
+         """
+         Initialize the CLIP model
+
+         Args:
+             model_name (str): name or path of the CLIP model to use
+             device (str): device to use ('cuda' or 'cpu')
+         """
+         self.device = device
+         self.model_name = model_name
+
+         logger.info(f"CLIP ๋ชจ๋ธ '{model_name}' ๋กœ๋“œ ์ค‘ (device: {device})...")
+
+         try:
+             # Cache directory configuration
+             os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+             os.makedirs("/tmp/transformers_cache", exist_ok=True)
+
+             self.model = CLIPModel.from_pretrained(model_name).to(device)
+             self.processor = CLIPProcessor.from_pretrained(model_name)
+             logger.info("CLIP ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ")
+         except Exception as e:
+             logger.error(f"CLIP ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {str(e)}")
+             raise
+
+     def encode_text(self, text):
+         """
+         Convert text into an embedding vector
+
+         Args:
+             text (str or list): text or list of texts to encode
+
+         Returns:
+             numpy.ndarray: embedding vectors
+         """
+         if isinstance(text, str):
+             text = [text]
+
+         try:
+             with torch.no_grad():
+                 # Encode the text
+                 inputs = self.processor(text=text, return_tensors="pt", padding=True, truncation=True).to(self.device)
+                 text_features = self.model.get_text_features(**inputs)
+
+                 # Normalize the text features
+                 text_embeddings = text_features / text_features.norm(dim=1, keepdim=True)
+
+                 return text_embeddings.cpu().numpy()
+         except Exception as e:
+             logger.error(f"ํ…์ŠคํŠธ ์ธ์ฝ”๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
+             return np.zeros((len(text), self.model.config.projection_dim))  # zero fallback sized to the projected embedding returned by get_text_features
+
+     def encode_image(self, image_source):
+         """
+         Convert an image into an embedding vector
+
+         Args:
+             image_source: image to encode (PIL Image, URL, or image path)
+
+         Returns:
+             numpy.ndarray: embedding vector
+         """
+         try:
+             # Load the image (URL, file path, PIL image object, or Base64)
+             if isinstance(image_source, str):
+                 if image_source.startswith('http'):
+                     # Load the image from a URL
+                     response = requests.get(image_source)
+                     image = Image.open(BytesIO(response.content)).convert('RGB')
+                 else:
+                     # Load the image from a local file
+                     image = Image.open(image_source).convert('RGB')
+             else:
+                 # Already a PIL image object
+                 image = image_source.convert('RGB')
+
+             with torch.no_grad():
+                 # Encode the image
+                 inputs = self.processor(images=image, return_tensors="pt").to(self.device)
+                 image_features = self.model.get_image_features(**inputs)
+
+                 # Normalize the image features
+                 image_embeddings = image_features / image_features.norm(dim=1, keepdim=True)
+
+                 return image_embeddings.cpu().numpy()
+         except Exception as e:
+             logger.error(f"์ด๋ฏธ์ง€ ์ธ์ฝ”๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
+             return np.zeros((1, self.model.config.projection_dim))  # zero fallback sized to the projected embedding returned by get_image_features
+
+     def calculate_similarity(self, text_embedding, image_embedding=None):
+         """
+         Compute the similarity between text and image embeddings
+
+         Args:
+             text_embedding (numpy.ndarray): text embedding
+             image_embedding (numpy.ndarray, optional): image embedding (if absent, compare text only)
+
+         Returns:
+             float: similarity score (between 0 and 1)
+         """
+         if image_embedding is None:
+             # Text-to-text similarity (cosine similarity)
+             similarity = np.dot(text_embedding, text_embedding.T)[0, 0]
+         else:
+             # Text-to-image similarity (cosine similarity)
+             similarity = np.dot(text_embedding, image_embedding.T)[0, 0]
+
+         # Normalize the similarity to the 0~1 range
+         similarity = (similarity + 1) / 2
+         return float(similarity)
+
+     def encode_batch_texts(self, texts):
+         """
+         Embed multiple texts at once
+
+         Args:
+             texts (list): list of texts
+
+         Returns:
+             numpy.ndarray: array of embedding vectors
+         """
+         # Batch-processing code
+         # A real implementation should tune the batch size to the available memory
+         return self.encode_text(texts)
+
+ # Module test code
+ if __name__ == "__main__":
+     # Initialize the model
+     clip_model = KoreanCLIPModel()
+
+     # Encode a sample text
+     sample_text = "๊ฒ€์€์ƒ‰ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค. ํ˜„๊ธˆ๊ณผ ์นด๋“œ๊ฐ€ ๋“ค์–ด์žˆ์–ด์š”."
+     text_embedding = clip_model.encode_text(sample_text)
+
+     print(f"ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ shape: {text_embedding.shape}")
+
+     # Compute similarity (text only)
+     sample_text2 = "๊ฒ€์€์ƒ‰ ์ง€๊ฐ‘์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค. ์•ˆ์— ํ˜„๊ธˆ๊ณผ ์นด๋“œ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค."
+     text_embedding2 = clip_model.encode_text(sample_text2)
+
+     similarity = clip_model.calculate_similarity(text_embedding, text_embedding2)
+     print(f"ํ…์ŠคํŠธ ๊ฐ„ ์œ ์‚ฌ๋„: {similarity:.4f}")
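Beyond the text-only check in the __main__ block, the same class handles cross-modal comparison. A minimal sketch, assuming the Bingsu/clip-vit-large-patch14-ko weights can be downloaded and using a placeholder image path that is not part of this repository:

# Hedged sketch: text-image similarity with KoreanCLIPModel.
from models.clip_model import KoreanCLIPModel

model = KoreanCLIPModel()

text_emb = model.encode_text("๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘")          # shape (1, projection_dim), L2-normalized
image_emb = model.encode_image("found_wallet.jpg")        # placeholder path; a URL or PIL.Image also works

score = model.calculate_similarity(text_emb, image_emb)   # cosine similarity rescaled to the 0~1 range
print(f"text-image similarity: {score:.4f}")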
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ fastapi
+ uvicorn
+ transformers==4.26.0
+ torch
+ Pillow
+ pydantic<2  # the models use pydantic v1-style Config.schema_extra and .dict()
+ python-multipart
+ httpx
+ python-dotenv
+ aiofiles
+ kiwipiepy==0.20.4
+ numpy
+ requests
utils/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Package for utility modules
+ """
+ from .similarity import calculate_similarity, find_similar_items
+
+ __all__ = ['calculate_similarity', 'find_similar_items']
utils/similarity.py ADDED
@@ -0,0 +1,355 @@
+ """
+ Similarity computation and related utility functions
+ Uses the Kiwi morphological analyzer to improve Korean text analysis
+ """
+ import os
+ import sys
+ import logging
+ import numpy as np
+ import re
+ from collections import Counter
+ from kiwipiepy import Kiwi
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Initialize the Kiwi morphological analyzer
+ kiwi = Kiwi()
+
+ # Configuration values (environment variables or defaults)
+ SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', '0.7'))
+ TEXT_WEIGHT = float(os.getenv('TEXT_WEIGHT', '0.7'))
+ IMAGE_WEIGHT = float(os.getenv('IMAGE_WEIGHT', '0.3'))
+ CATEGORY_WEIGHT = float(os.getenv('CATEGORY_WEIGHT', '0.5'))
+ ITEM_NAME_WEIGHT = float(os.getenv('ITEM_NAME_WEIGHT', '0.3'))
+ COLOR_WEIGHT = float(os.getenv('COLOR_WEIGHT', '0.1'))
+ CONTENT_WEIGHT = float(os.getenv('CONTENT_WEIGHT', '0.1'))
+
+ def preprocess_text(text):
+     """
+     Text preprocessing function
+
+     Args:
+         text (str): text to preprocess
+
+     Returns:
+         str: preprocessed text
+     """
+     if not text:
+         return ""
+
+     # Lowercase (for English text)
+     text = text.lower()
+
+     # Remove redundant whitespace
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     # Remove special characters (keeping Korean, English, and digits)
+     text = re.sub(r'[^\w\s๊ฐ€-ํžฃใ„ฑ-ใ…Žใ…-ใ…ฃ]', ' ', text)
+
+     return text
+
+ def extract_keywords(text):
+     """
+     Extract key keywords from text using the Kiwi morphological analyzer
+
+     Args:
+         text (str): text to extract keywords from
+
+     Returns:
+         list: list of keywords (mainly nouns and adjectives)
+     """
+     if not text:
+         return []
+
+     # Preprocess the text
+     processed_text = preprocess_text(text)
+
+     try:
+         # Run Kiwi morphological analysis
+         result = kiwi.analyze(processed_text)
+
+         # Extract key keywords (nouns, adjectives, etc.)
+         keywords = []
+         for token in result[0][0]:
+             # NNG: common noun, NNP: proper noun, VA: adjective, VV: verb, SL: foreign word (e.g. English)
+             if token.tag in ['NNG', 'NNP', 'VA', 'SL']:
+                 # Single-character nouns may carry little information, so filter them (optional)
+                 if len(token.form) > 1 or token.tag in ['SL']:
+                     keywords.append(token.form)
+
+         logger.debug(f"ํ‚ค์›Œ๋“œ ์ถ”์ถœ ๊ฒฐ๊ณผ: {keywords}")
+         return keywords
+
+     except Exception as e:
+         logger.warning(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}, ๊ธฐ๋ณธ ๋ถ„๋ฆฌ ๋ฐฉ์‹์œผ๋กœ ๋Œ€์ฒด")
+         # Fall back to simple whitespace splitting on error
+         words = processed_text.split()
+         return words
+
+ def calculate_text_similarity(text1, text2, weights=None):
+     """
+     Compute the similarity between two texts (using Kiwi morphological analysis)
+
+     Args:
+         text1 (str): first text
+         text2 (str): second text
+         weights (dict, optional): weights for each component
+
+     Returns:
+         float: similarity score (between 0 and 1)
+     """
+     if not text1 or not text2:
+         return 0.0
+
+     # Default weights
+     if weights is None:
+         weights = {
+             'common_words': 0.7,  # increased weight for the common-word ratio
+             'length_ratio': 0.15,
+             'word_order': 0.15
+         }
+
+     # Extract keywords from the texts (using the Kiwi morphological analyzer)
+     keywords1 = extract_keywords(text1)
+     keywords2 = extract_keywords(text2)
+
+     if not keywords1 or not keywords2:
+         return 0.0
+
+     # 1. Common-word ratio
+     common_words = set(keywords1) & set(keywords2)
+     common_ratio = len(common_words) / max(1, min(len(set(keywords1)), len(set(keywords2))))
+
+     # 2. Text length similarity
+     length_ratio = min(len(keywords1), len(keywords2)) / max(1, max(len(keywords1), len(keywords2)))
+
+     # 3. Word-order similarity (optional)
+     word_order_sim = 0.0
+     if common_words:
+         # Similarity based on the positional differences of the common words
+         positions1 = {word: i for i, word in enumerate(keywords1) if word in common_words}
+         positions2 = {word: i for i, word in enumerate(keywords2) if word in common_words}
+
+         if positions1 and positions2:
+             pos_diff_sum = sum(abs(positions1[word] - positions2[word]) for word in common_words if word in positions1 and word in positions2)
+             max_diff = len(keywords1) + len(keywords2)
+             word_order_sim = 1.0 - (pos_diff_sum / max(1, max_diff))
+
+     # Apply the weights to compute the final similarity
+     similarity = (
+         weights['common_words'] * common_ratio +
+         weights['length_ratio'] * length_ratio +
+         weights['word_order'] * word_order_sim
+     )
+
+     return min(1.0, max(0.0, similarity))
+
+ def calculate_category_similarity(category1, category2):
+     """
+     Compute the similarity between two categories (handles the '๊ธฐํƒ€' (other) category)
+
+     Args:
+         category1 (str): first category
+         category2 (str): second category
+
+     Returns:
+         float: similarity score (between 0 and 1)
+     """
+     if not category1 or not category2:
+         return 0.0
+
+     # Preprocess the categories
+     cat1 = preprocess_text(str(category1))
+     cat2 = preprocess_text(str(category2))
+
+     # Exact match
+     if cat1 == cat2:
+         return 1.0
+
+     # Extract keywords with Kiwi
+     keywords1 = set(extract_keywords(cat1))
+     keywords2 = set(extract_keywords(cat2))
+
+     # Handle the '๊ธฐํƒ€' (other) category
+     if '๊ธฐํƒ€' in cat1 or '๊ธฐํƒ€' in cat2:
+         # Compare the extracted keyword sets
+         if not keywords1 or not keywords2:
+             return 0.3  # give the '๊ธฐํƒ€' category a baseline similarity
+
+         # Higher similarity when there are common keywords
+         common_words = keywords1 & keywords2
+         if common_words:
+             return 0.7
+
+         return 0.3  # '๊ธฐํƒ€' category but no common keywords
+
+     # General category similarity
+     return calculate_text_similarity(cat1, cat2)
+
+ def calculate_similarity(user_post, lost_item, clip_model=None):
+     """
+     Compute the overall similarity between a user post and a found item
+
+     Args:
+         user_post (dict): the user's post
+         lost_item (dict): the found-item data
+         clip_model (KoreanCLIPModel, optional): CLIP model instance
+
+     Returns:
+         float: similarity score (between 0 and 1)
+         dict: detailed similarity information
+     """
+     # Text similarity components
+     text_similarities = {}
+
+     # 1. Category similarity
+     category_sim = 0.0
+     if 'category' in user_post and 'category' in lost_item:
+         category_sim = calculate_category_similarity(user_post['category'], lost_item['category'])
+     text_similarities['category'] = category_sim
+
+     # 2. Item-name similarity
+     item_name_sim = 0.0
+     user_item_name = user_post.get('item_name', '')
+     lost_item_name = lost_item.get('item_name', '')
+     if user_item_name and lost_item_name:
+         item_name_sim = calculate_text_similarity(user_item_name, lost_item_name)
+     text_similarities['item_name'] = item_name_sim
+
+     # 3. Color similarity
+     color_sim = 0.0
+     user_color = user_post.get('color', '')
+     lost_color = lost_item.get('color', '')
+     if user_color and lost_color:
+         color_sim = calculate_text_similarity(user_color, lost_color)
+     text_similarities['color'] = color_sim
+
+     # 4. Content similarity
+     content_sim = 0.0
+     user_content = user_post.get('content', '')
+     lost_content = lost_item.get('content', '')
+     if user_content and lost_content:
+         content_sim = calculate_text_similarity(user_content, lost_content)
+     text_similarities['content'] = content_sim
+
+     # Combined text similarity (weighted)
+     text_similarity = (
+         CATEGORY_WEIGHT * category_sim +
+         ITEM_NAME_WEIGHT * item_name_sim +
+         COLOR_WEIGHT * color_sim +
+         CONTENT_WEIGHT * content_sim
+     )
+
+     # Image-text similarity using the CLIP model
+     image_similarity = 0.0
+     has_image = False
+
+     if clip_model is not None:
+         # When both the user post and the found item have an image
+         user_image = user_post.get('image', None) or user_post.get('image_url', None)
+         lost_image = lost_item.get('image', None) or lost_item.get('image_url', None)
+
+         if user_image and lost_image:
+             try:
+                 # Compute similarity with the CLIP model
+                 user_text_embedding = clip_model.encode_text(user_post.get('content', ''))
+                 user_image_embedding = clip_model.encode_image(user_image)
+
+                 item_text_embedding = clip_model.encode_text(lost_item.get('content', ''))
+                 item_image_embedding = clip_model.encode_image(lost_image)
+
+                 # Cross text-image similarities
+                 text_to_image_sim = clip_model.calculate_similarity(user_text_embedding, item_image_embedding)
+                 image_to_text_sim = clip_model.calculate_similarity(item_text_embedding, user_image_embedding)
+                 image_to_image_sim = clip_model.calculate_similarity(user_image_embedding, item_image_embedding)
+
+                 image_similarity = (text_to_image_sim + image_to_text_sim + image_to_image_sim) / 3
+                 has_image = True
+             except Exception as e:
+                 logger.warning(f"์ด๋ฏธ์ง€ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
+
+     # Final similarity (text and image weights applied)
+     if has_image:
+         final_similarity = TEXT_WEIGHT * text_similarity + IMAGE_WEIGHT * image_similarity
+     else:
+         final_similarity = text_similarity
+
+     # Detailed similarity information
+     similarity_details = {
+         'text_similarity': text_similarity,
+         'image_similarity': image_similarity if has_image else None,
+         'final_similarity': final_similarity,
+         'details': text_similarities
+     }
+
+     return final_similarity, similarity_details
+
+ def find_similar_items(user_post, lost_items, threshold=SIMILARITY_THRESHOLD, clip_model=None):
+     """
+     Find found items similar to the user's post
+
+     Args:
+         user_post (dict): the user's post
+         lost_items (list): list of found-item data
+         threshold (float): similarity threshold (default: from config)
+         clip_model (KoreanCLIPModel, optional): CLIP model instance
+
+     Returns:
+         list: found items whose similarity is at or above the threshold (highest first)
+     """
+     similar_items = []
+
+     logger.info(f"์‚ฌ์šฉ์ž ๊ฒŒ์‹œ๊ธ€๊ณผ {len(lost_items)}๊ฐœ ์Šต๋“๋ฌผ ๋น„๊ต ์ค‘...")
+
+     for item in lost_items:
+         similarity, details = calculate_similarity(user_post, item, clip_model)
+
+         if similarity >= threshold:
+             similar_items.append({
+                 'item': item,
+                 'similarity': similarity,
+                 'details': details
+             })
+
+     # Sort by similarity in descending order
+     similar_items.sort(key=lambda x: x['similarity'], reverse=True)
+
+     logger.info(f"์œ ์‚ฌ๋„ {threshold} ์ด์ƒ์ธ ์Šต๋“๋ฌผ {len(similar_items)}๊ฐœ ๋ฐœ๊ฒฌ")
+
+     return similar_items
+
+ # Module test code
+ if __name__ == "__main__":
+     # Text similarity test
+     text1 = "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค."
+     text2 = "๊ฒ€์ • ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค."
+     text3 = "๋…ธํŠธ๋ถ์„ ๋ถ„์‹คํ–ˆ์Šต๋‹ˆ๋‹ค."
+
+     # Keyword extraction test
+     print("[ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ…Œ์ŠคํŠธ ]")
+     print(f"ํ…์ŠคํŠธ 1: '{text1}'")
+     print(f"์ถ”์ถœ๋œ ํ‚ค์›Œ๋“œ: {extract_keywords(text1)}")
+     print(f"ํ…์ŠคํŠธ 2: '{text2}'")
+     print(f"์ถ”์ถœ๋œ ํ‚ค์›Œ๋“œ: {extract_keywords(text2)}")
+
+     # Similarity test
+     sim12 = calculate_text_similarity(text1, text2)
+     sim13 = calculate_text_similarity(text1, text3)
+
+     print("\n[ ์œ ์‚ฌ๋„ ํ…Œ์ŠคํŠธ ]")
+     print(f"ํ…์ŠคํŠธ 1-2 ์œ ์‚ฌ๋„: {sim12:.4f}")
+     print(f"ํ…์ŠคํŠธ 1-3 ์œ ์‚ฌ๋„: {sim13:.4f}")
+
+     # Category similarity test
+     cat1 = "์ง€๊ฐ‘"
+     cat2 = "๊ฐ€๋ฐฉ/์ง€๊ฐ‘"
+     cat3 = "๊ธฐํƒ€"
+
+     cat_sim12 = calculate_category_similarity(cat1, cat2)
+     cat_sim13 = calculate_category_similarity(cat1, cat3)
+
+     print("\n[ ์นดํ…Œ๊ณ ๋ฆฌ ์œ ์‚ฌ๋„ ํ…Œ์ŠคํŠธ ]")
+     print(f"์นดํ…Œ๊ณ ๋ฆฌ 1-2 ์œ ์‚ฌ๋„: {cat_sim12:.4f}")
+     print(f"์นดํ…Œ๊ณ ๋ฆฌ 1-3 ์œ ์‚ฌ๋„: {cat_sim13:.4f}")
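The text pipeline can also be driven directly, without the API layer or the CLIP model (passing clip_model=None skips the image branch entirely). A minimal sketch with made-up post and candidate dicts; with the default weights, an exact category match alone contributes 0.5 to the text score, so the wallet candidate clears the 0.3 threshold below while the laptop does not:

# Hedged sketch: text-only matching via utils.similarity, no CLIP model involved.
from utils.similarity import find_similar_items

user_post = {
    "category": "์ง€๊ฐ‘",
    "item_name": "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘",
    "color": "๊ฒ€์ •์ƒ‰",
    "content": "๊ฐ•๋‚จ์—ญ ๊ทผ์ฒ˜์—์„œ ๊ฒ€์ •์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค.",
}
candidates = [
    {"category": "์ง€๊ฐ‘", "item_name": "๊ฒ€์ • ๊ฐ€์ฃฝ ์ง€๊ฐ‘", "color": "๊ฒ€์ •", "content": "์ง€ํ•˜์ฒ ์—์„œ ๊ฒ€์ • ์ง€๊ฐ‘์„ ์Šต๋“ํ–ˆ์Šต๋‹ˆ๋‹ค."},
    {"category": "์ „์ž๊ธฐ๊ธฐ", "item_name": "๋…ธํŠธ๋ถ", "color": "์€์ƒ‰", "content": "์นดํŽ˜์—์„œ ๋…ธํŠธ๋ถ์„ ์Šต๋“ํ–ˆ์Šต๋‹ˆ๋‹ค."},
]

# With clip_model=None only the weighted text similarity
# (category 0.5, item_name 0.3, color 0.1, content 0.1 by default) is used.
matches = find_similar_items(user_post, candidates, threshold=0.3, clip_model=None)
for m in matches:
    print(m["item"]["item_name"], round(m["similarity"], 4))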