| """ | |
| ์ ์ฌ๋ ๊ณ์ฐ ๋ฐ ๊ด๋ จ ์ ํธ๋ฆฌํฐ ํจ์ | |
| Kiwi ํํ์ ๋ถ์๊ธฐ๋ฅผ ์ฌ์ฉํ์ฌ ํ๊ตญ์ด ํ ์คํธ ๋ถ์ ๊ฐ์ | |
| """ | |
| import os | |
| import sys | |
| import logging | |
| import numpy as np | |
| import re | |
| from collections import Counter | |
| from kiwipiepy import Kiwi | |
| # ๋ก๊น ์ค์ | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
| logger = logging.getLogger(__name__) | |
| # Kiwi ํํ์ ๋ถ์๊ธฐ ์ด๊ธฐํ | |
| kiwi = Kiwi() | |
| # ์ค์ ๊ฐ (ํ๊ฒฝ๋ณ์ ๋๋ ๊ธฐ๋ณธ๊ฐ) | |
| SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', '0.6')) | |
| TEXT_WEIGHT = float(os.getenv('TEXT_WEIGHT', '0.7')) | |
| IMAGE_WEIGHT = float(os.getenv('IMAGE_WEIGHT', '0.3')) | |
| CATEGORY_WEIGHT = float(os.getenv('CATEGORY_WEIGHT', '0.5')) | |
| ITEM_NAME_WEIGHT = float(os.getenv('ITEM_NAME_WEIGHT', '0.3')) | |
| COLOR_WEIGHT = float(os.getenv('COLOR_WEIGHT', '0.1')) | |
| CONTENT_WEIGHT = float(os.getenv('CONTENT_WEIGHT', '0.1')) | |
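
# Configuration note: the constants above can be overridden from the
# environment before the process starts, e.g. (shell syntax assumed,
# the file name below is illustrative only):
#   SIMILARITY_THRESHOLD=0.7 CATEGORY_WEIGHT=0.4 python similarity_utils.py
# Non-numeric values raise ValueError at import time, so misconfiguration
# fails fast.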

def preprocess_text(text):
    """
    Preprocess a text string for comparison.

    Args:
        text (str): Text to preprocess.

    Returns:
        str: Preprocessed text.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Lowercase (affects English text only)
    text = text.lower()
    # Remove special characters (keep Hangul, Latin letters, and digits)
    text = re.sub(r'[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ]', ' ', text)
    # Collapse redundant whitespace (done after special-character removal,
    # so the replacement spaces are normalized as well)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
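
# Deterministic examples (no morphological analysis is involved here):
#   preprocess_text("  Black LEATHER Wallet!! ")  ->  "black leather wallet"
#   preprocess_text(None)                         ->  ""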

def extract_keywords(text):
    """
    Extract content keywords from text using the Kiwi morphological analyzer.

    Args:
        text (str): Text to extract keywords from.

    Returns:
        list: List of keywords (mostly nouns and adjectives).
    """
    if not text:
        return []
    # Preprocess the text
    processed_text = preprocess_text(text)
    try:
        # Run Kiwi morphological analysis
        result = kiwi.analyze(processed_text)
        # Collect content keywords (nouns, adjectives, etc.)
        keywords = []
        for token in result[0][0]:
            # NNG: common noun, NNP: proper noun, VA: adjective, SL: foreign word (e.g. English)
            if token.tag in ['NNG', 'NNP', 'VA', 'SL']:
                # Optionally filter single-character nouns, which tend to carry little meaning
                if len(token.form) > 1 or token.tag == 'SL':
                    keywords.append(token.form)
        logger.debug(f"extracted keywords: {keywords}")
        return keywords
    except Exception as e:
        logger.warning(f"morphological analysis failed: {str(e)}; falling back to whitespace tokenization")
        # Fall back to naive whitespace splitting on error
        return processed_text.split()
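
# Illustrative only -- the exact token list depends on the installed Kiwi
# model version:
#   extract_keywords("검은색 가죽 지갑을 잃어버렸습니다")
#   might return something like ['검은색', '가죽', '지갑']: particles and the
#   verb are dropped, since only NNG/NNP/VA/SL tags pass the filter.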

def calculate_text_similarity(text1, text2, weights=None):
    """
    Calculate the similarity between two texts (backed by Kiwi morphological analysis).
    Improved version: exact matches and containment are considered as well.

    Args:
        text1 (str): First text.
        text2 (str): Second text.
        weights (dict, optional): Weight for each component.

    Returns:
        float: Similarity score (between 0 and 1).
    """
    if not text1 or not text2:
        return 0.0
    # Preprocess the raw texts (preprocessing already lowercases)
    clean_text1 = preprocess_text(text1)
    clean_text2 = preprocess_text(text2)
    # 1. Exact match check
    if clean_text1 == clean_text2:
        return 1.0
    # 2. Containment check (the shorter text appears inside the longer one)
    if clean_text1 in clean_text2 or clean_text2 in clean_text1:
        # Scale the similarity by the length ratio (minimum 0.7)
        len_ratio = min(len(clean_text1), len(clean_text2)) / max(len(clean_text1), len(clean_text2))
        return 0.7 + 0.3 * len_ratio  # value in the 0.7-1.0 range
    # Default weights
    if weights is None:
        weights = {
            'common_words': 0.7,   # weight of the common-word ratio
            'length_ratio': 0.15,  # weight of length similarity
            'word_order': 0.15     # weight of word-order similarity
        }
    # Extract keywords from both texts (using the Kiwi morphological analyzer)
    keywords1 = extract_keywords(text1)
    keywords2 = extract_keywords(text2)
    if not keywords1 or not keywords2:
        # No keywords: fall back to the similarity of the preprocessed texts
        jaccard_sim = calculate_jaccard_similarity(clean_text1, clean_text2)
        return max(0.1, jaccard_sim)  # grant a minimum similarity of 0.1
    # 3. Common-word ratio (improved)
    common_words = set(keywords1) & set(keywords2)
    if common_words:
        # Ratio of shared words relative to the smaller keyword set
        common_ratio = len(common_words) / min(len(set(keywords1)), len(set(keywords2)))
        # Boost the ratio when meaningful keywords are shared
        important_keywords = [w for w in common_words
                              if len(w) > 1 and not w.isdigit()]
        if important_keywords:
            common_ratio = max(common_ratio, 0.5 + 0.3 * (len(important_keywords) / len(common_words)))
    else:
        # No shared keywords: fall back to a damped Jaccard similarity
        common_ratio = calculate_jaccard_similarity(clean_text1, clean_text2) * 0.5
    # 4. Length similarity of the keyword lists
    length_ratio = min(len(keywords1), len(keywords2)) / max(1, max(len(keywords1), len(keywords2)))
    # 5. Word-order similarity (optional component)
    word_order_sim = 0.0
    if common_words:
        # Similarity based on the positional differences of shared words
        positions1 = {word: i for i, word in enumerate(keywords1) if word in common_words}
        positions2 = {word: i for i, word in enumerate(keywords2) if word in common_words}
        if positions1 and positions2:
            common_words_positions = set(positions1.keys()) & set(positions2.keys())
            if common_words_positions:
                pos_diff_sum = sum(abs(positions1[word] - positions2[word])
                                   for word in common_words_positions)
                max_diff = len(keywords1) + len(keywords2)
                word_order_sim = 1.0 - min(1.0, pos_diff_sum / max(1, max_diff))
    # Combine the components with their weights
    similarity = (
        weights['common_words'] * common_ratio +
        weights['length_ratio'] * length_ratio +
        weights['word_order'] * word_order_sim
    )
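    # Worked example with the default weights: common_ratio = 0.6,
    # length_ratio = 0.8 and word_order_sim = 0.5 combine to
    #   0.7*0.6 + 0.15*0.8 + 0.15*0.5 = 0.42 + 0.12 + 0.075 = 0.615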
| # ์ต์ ์ ์ฌ๋ ๋ณด์ฅ (ํค์๋๊ฐ ์๋ค๋ฉด) | |
| if common_words: | |
| similarity = max(similarity, 0.1 + 0.2 * len(common_words) / max(len(keywords1), len(keywords2))) | |
| return min(1.0, max(0.0, similarity)) | |
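
# Example: the call below is deterministic because the containment
# short-circuit fires before any morphological analysis:
#   calculate_text_similarity("검은색 가죽 지갑", "가죽 지갑")
#   -> 0.7 + 0.3 * (5/9) ≈ 0.8667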
| # ์์นด๋ ์ ์ฌ๋ ๊ณ์ฐ ํจ์ ์ถ๊ฐ | |
| def calculate_jaccard_similarity(text1, text2): | |
| """ | |
| ๋ ํ ์คํธ ๊ฐ์ ์์นด๋ ์ ์ฌ๋ ๊ณ์ฐ | |
| Args: | |
| text1 (str): ์ฒซ ๋ฒ์งธ ํ ์คํธ | |
| text2 (str): ๋ ๋ฒ์งธ ํ ์คํธ | |
| Returns: | |
| float: ์์นด๋ ์ ์ฌ๋ (0~1 ์ฌ์ด) | |
| """ | |
| set1 = set(text1.lower().split()) | |
| set2 = set(text2.lower().split()) | |
| if not set1 or not set2: | |
| return 0.0 | |
| intersection = len(set1 & set2) | |
| union = len(set1 | set2) | |
| return intersection / max(1, union) | |
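
# Deterministic example:
#   calculate_jaccard_similarity("black leather wallet", "black wallet")
#   -> |{black, wallet}| / |{black, leather, wallet}| = 2/3 ≈ 0.6667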

def calculate_category_similarity(category1, category2):
    """
    Calculate the similarity between two categories (improved version).

    Args:
        category1 (str or int): First category.
        category2 (str or int): Second category.

    Returns:
        float: Similarity score (between 0 and 1).
    """
    # Handle None or empty values
    if not category1 or not category2:
        return 0.0
    # Integer IDs: compare directly
    if isinstance(category1, int) and isinstance(category2, int):
        return 1.0 if category1 == category2 else 0.0
    # Convert to strings
    cat1 = str(category1).strip()
    cat2 = str(category2).strip()
    # Exact match
    if cat1.lower() == cat2.lower():
        return 1.0
    # Preprocess the category names
    cat1_processed = preprocess_text(cat1)
    cat2_processed = preprocess_text(cat2)
    # Match after preprocessing
    if cat1_processed == cat2_processed:
        return 1.0
    # Containment check (e.g. '지갑' [wallet] vs. '가죽 지갑' [leather wallet])
    if cat1_processed in cat2_processed or cat2_processed in cat1_processed:
        # Scale the similarity by the length ratio
        len_ratio = min(len(cat1_processed), len(cat2_processed)) / max(len(cat1_processed), len(cat2_processed))
        return max(0.8, len_ratio)  # minimum similarity of 0.8
    # Extract keywords and check for shared words
    keywords1 = set(extract_keywords(cat1))
    keywords2 = set(extract_keywords(cat2))
    # Shared keywords present
    common_keywords = keywords1 & keywords2
    if common_keywords:
        # Similarity from the shared-keyword ratio
        common_ratio = len(common_keywords) / min(len(keywords1), len(keywords2)) if keywords1 and keywords2 else 0
        return max(0.5, common_ratio)  # minimum similarity of 0.5
    # Handle the '기타' ("Other") category: only a weak association
    if '기타' in cat1 or '기타' in cat2:
        return 0.3
    # Fall back to full text similarity
    return calculate_text_similarity(cat1, cat2)
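
# All three calls below are deterministic (no Kiwi involvement):
#   calculate_category_similarity(3, 3)                -> 1.0  (same integer ID)
#   calculate_category_similarity("지갑", "지갑")        -> 1.0  (exact match)
#   calculate_category_similarity("지갑", "가방/지갑")   -> 0.8  (containment,
#                                             after preprocessing strips '/')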

def calculate_similarity(user_post, lost_item, clip_model=None):
    """
    Calculate the overall similarity between a user post and a found-item record.
    Field names are mapped for compatibility with the Spring Boot backend.

    Args:
        user_post (dict): User post data (the lost item).
        lost_item (dict): Found-item data (found_item).
        clip_model (KoreanCLIPModel, optional): CLIP model instance.

    Returns:
        tuple: (float similarity score between 0 and 1, dict of similarity details)
    """
    # Per-field text similarities
    text_similarities = {}
    # Log field availability as we go
    logger.info("==== similarity calculation started ====")
    # 1. Category similarity - compare by ID only
    category_sim = 0.0
    # User category field: 'category' or 'itemCategoryId'
    user_category_id = None
    if 'category' in user_post and user_post['category'] is not None:
        user_category_id = user_post['category']
    elif 'itemCategoryId' in user_post and user_post['itemCategoryId'] is not None:
        user_category_id = user_post['itemCategoryId']
    # Found-item category field: only 'item_category_id' is used
    lost_category_id = None
    if 'item_category_id' in lost_item and lost_item['item_category_id'] is not None:
        lost_category_id = lost_item['item_category_id']
    # Log the category information
    logger.info(f"category ID comparison: user({user_category_id}) vs found item({lost_category_id})")
    # Category ID similarity - IDs either match exactly or not at all
    if user_category_id is not None and lost_category_id is not None:
        try:
            # Compare as integers
            user_category_id = int(user_category_id)
            lost_category_id = int(lost_category_id)
            category_sim = 1.0 if user_category_id == lost_category_id else 0.0
            logger.info(f"category ID match: {category_sim}")
        except (ValueError, TypeError):
            logger.warning(f"category IDs are not numeric: {user_category_id}, {lost_category_id}")
            category_sim = 0.0
    text_similarities['category'] = category_sim
| # 2. ๋ฌผํ๋ช ์ ์ฌ๋ (์ฌ์ฉ์ ์ธก์ด ์์ ๊ฒฝ์ฐ ์นดํ ๊ณ ๋ฆฌ๋ ๊ฒ์์ด์์ ์ถ์ถ) | |
| item_name_sim = 0.0 | |
| user_item_name = None | |
| # ์ฌ์ฉ์ ๋ฌผํ๋ช : title, search_keyword, content ์ค์์ ๊ฐ์ ธ์ค๊ธฐ | |
| if 'title' in user_post and user_post['title']: | |
| user_item_name = user_post['title'] | |
| elif 'search_keyword' in user_post and user_post['search_keyword']: | |
| # ๊ฒ์ ํค์๋๊ฐ ์์ผ๋ฉด ์ฌ์ฉ | |
| user_item_name = user_post['search_keyword'] | |
| elif 'content' in user_post and user_post['content']: | |
| # ๋ด์ฉ์์ ์ฒซ ๋ฌธ์ฅ์ด๋ ํค์๋ ์ถ์ถ | |
| content = user_post['content'] | |
| # ์ฒซ 10๋จ์ด ์ถ์ถ (๋๋ ์ ์ ํ ๊ธธ์ด) | |
| words = content.split()[:10] | |
| if words: | |
| user_item_name = ' '.join(words) | |
| # ์ต๋๋ฌผ ๋ฌผํ๋ช : name ๋๋ title์์ ๊ฐ์ ธ์ค๊ธฐ | |
| lost_item_name = None | |
| if 'name' in lost_item and lost_item['name']: | |
| lost_item_name = lost_item['name'] | |
| elif 'title' in lost_item and lost_item['title']: | |
| lost_item_name = lost_item['title'] | |
| logger.info(f"๋ฌผํ๋ช ํ๋: ์ฌ์ฉ์({user_item_name}) vs ์ต๋๋ฌผ({lost_item_name})") | |
| # ๋ฌผํ๋ช ์ ์ฌ๋ ๊ณ์ฐ | |
| if user_item_name and lost_item_name: | |
| # ์ ์ฒ๋ฆฌ ์ ์ฉ | |
| user_item_name_clean = preprocess_text(str(user_item_name)) | |
| lost_item_name_clean = preprocess_text(str(lost_item_name)) | |
| # ๊ธฐ๋ณธ ์ ์ฌ๋ ๊ณ์ฐ | |
| item_name_sim = calculate_text_similarity(user_item_name_clean, lost_item_name_clean) | |
| # ์์ ์ผ์นํ๊ฑฐ๋ ํฌํจ ๊ด๊ณ์ธ ๊ฒฝ์ฐ ๊ฐ์ค์น ๋ถ์ฌ | |
| if user_item_name_clean.lower() == lost_item_name_clean.lower(): | |
| item_name_sim = 1.0 # ์์ ์ผ์น | |
| logger.info("๋ฌผํ๋ช ์์ ์ผ์น") | |
| elif user_item_name_clean.lower() in lost_item_name_clean.lower() or lost_item_name_clean.lower() in user_item_name_clean.lower(): | |
| item_name_sim = 0.8 # ๋ถ๋ถ ํฌํจ | |
| logger.info("๋ฌผํ๋ช ํฌํจ ๊ด๊ณ ๊ฐ์ง") | |
| elif user_item_name is None and lost_item_name: | |
| # ์ฌ์ฉ์ ๋ฌผํ๋ช ์ด ์๊ณ ์ต๋๋ฌผ ๋ฌผํ๋ช ๋ง ์๋ ๊ฒฝ์ฐ | |
| # ์นดํ ๊ณ ๋ฆฌ๋ ์์์ด ์ผ์นํ๋ฉด ์ต์ ์ ์ฌ๋ ๋ถ์ฌ | |
| if category_sim > 0.5 or ('color' in user_post and 'color' in lost_item and | |
| preprocess_text(user_post['color']).lower() == preprocess_text(lost_item['color']).lower()): | |
| item_name_sim = 0.3 # ์ต์ ์ ์ฌ๋ ๋ถ์ฌ | |
| logger.info("์ฌ์ฉ์ ๋ฌผํ๋ช ์์, ์นดํ ๊ณ ๋ฆฌ/์์ ์ ์ฌ์ฑ ๊ธฐ๋ฐ ์ต์ ์ ์ฌ๋ ๋ถ์ฌ") | |
| else: | |
| logger.warning(f"์ฌ์ฉ์ ๋ฌผํ๋ช ๋๋ฝ, ์ ์ฌ๋ 0") | |
| else: | |
| logger.warning(f"๋ฌผํ๋ช ๋น๊ต ๋ถ๊ฐ: ์ฌ์ฉ์({user_item_name}) ๋๋ ์ต๋๋ฌผ({lost_item_name}) ๋ฌผํ๋ช ๋๋ฝ") | |
| text_similarities['item_name'] = item_name_sim | |
| # 3. ์์ ์ ์ฌ๋ | |
| color_sim = 0.0 | |
| # ์์ ํ๋๋ ๋์ผํ๊ฒ 'color' | |
| user_color = user_post.get('color', '') | |
| lost_color = lost_item.get('color', '') | |
| logger.info(f"์์ ๋น๊ต: ์ฌ์ฉ์({user_color}) vs ์ต๋๋ฌผ({lost_color})") | |
| # ์์ ์ ์ฌ๋ ๊ณ์ฐ | |
| if user_color and lost_color: | |
| # ์์ ํค์๋ ์ถ์ถ | |
| user_color_clean = preprocess_text(str(user_color)) | |
| lost_color_clean = preprocess_text(str(lost_color)) | |
| # ์์ ์ผ์น ๊ฒ์ฌ | |
| if user_color_clean.lower() == lost_color_clean.lower(): | |
| color_sim = 1.0 | |
| logger.info("์์ ์์ ์ผ์น") | |
| else: | |
| # ๊ณตํต ํค์๋ ๊ฒ์ฌ | |
| user_color_keywords = extract_keywords(user_color) | |
| lost_color_keywords = extract_keywords(lost_color) | |
| common_keywords = set(user_color_keywords) & set(lost_color_keywords) | |
| if common_keywords: | |
| color_sim = 0.8 | |
| logger.info(f"์์ ๊ณตํต ํค์๋: {common_keywords}") | |
| else: | |
| color_sim = calculate_text_similarity(user_color, lost_color) | |
| logger.info(f"์์ ๊ธฐ๋ณธ ์ ์ฌ๋: {color_sim}") | |
| else: | |
| logger.warning(f"์์ ๋๋ฝ: ์ฌ์ฉ์({user_color}) ๋๋ ์ต๋๋ฌผ({lost_color})") | |
| text_similarities['color'] = color_sim | |
| # 4. ๋ด์ฉ ์ ์ฌ๋ | |
| content_sim = 0.0 | |
| # ๋ชจ๋ ๊ฐ๋ฅํ ๋ด์ฉ ํ๋ ๊ฒ์ฌ | |
| possible_content_fields_user = ['detail', 'content', 'description'] | |
| possible_content_fields_lost = ['detail', 'content', 'description'] | |
| # ์ฌ์ฉ์ ๋ด์ฉ ํ๋ ์ฐพ๊ธฐ | |
| user_content = None | |
| user_content_field = None | |
| for field in possible_content_fields_user: | |
| if field in user_post and user_post[field]: | |
| user_content = user_post[field] | |
| user_content_field = field | |
| break | |
| # ์ต๋๋ฌผ ๋ด์ฉ ํ๋ ์ฐพ๊ธฐ | |
| lost_content = None | |
| lost_content_field = None | |
| for field in possible_content_fields_lost: | |
| if field in lost_item and lost_item[field]: | |
| lost_content = lost_item[field] | |
| lost_content_field = field | |
| break | |
| logger.info(f"๋ด์ฉ ํ๋: ์ฌ์ฉ์({user_content_field}) vs ์ต๋๋ฌผ({lost_content_field})") | |
| # ๋ด์ฉ ์ ์ฌ๋ ๊ณ์ฐ | |
| if user_content and lost_content: | |
| # ๋ด์ฉ์ ๊ธธ์ด๊ฐ ์งง์ ์ ์์ผ๋ฏ๋ก ์ ์ฒ๋ฆฌ ํ ํค์๋ ์ถ์ถ์ ์ค์ | |
| user_content_keywords = extract_keywords(user_content) | |
| lost_content_keywords = extract_keywords(lost_content) | |
| logger.info(f"๋ด์ฉ ํค์๋ ์: ์ฌ์ฉ์({len(user_content_keywords)}๊ฐ) vs ์ต๋๋ฌผ({len(lost_content_keywords)}๊ฐ)") | |
| if user_content_keywords and lost_content_keywords: | |
| # ๊ณตํต ํค์๋ ๋น์จ ๊ณ์ฐ | |
| common_keywords = set(user_content_keywords) & set(lost_content_keywords) | |
| if common_keywords: | |
| common_ratio = len(common_keywords) / min(len(user_content_keywords), len(lost_content_keywords)) | |
| logger.info(f"๋ด์ฉ ๊ณตํต ํค์๋: {len(common_keywords)}๊ฐ, ๊ณตํต ๋น์จ: {common_ratio:.4f}") | |
| # ๊ณตํต ํค์๋๊ฐ ๋ง์์๋ก ์ ์ฌ๋ ์ฆ๊ฐ | |
| if common_ratio >= 0.5: # 50% ์ด์ ๊ณตํต ํค์๋ | |
| content_sim = max(0.7, common_ratio) | |
| logger.info(f"๋ด์ฉ ๋์ ๊ณตํต ๋น์จ: {content_sim:.4f}") | |
| else: | |
| text_sim = calculate_text_similarity(user_content, lost_content) | |
| content_sim = max(text_sim, common_ratio) | |
| logger.info(f"๋ด์ฉ ๊ธฐ๋ณธ ์ ์ฌ๋: {text_sim:.4f}, ์ต์ข : {content_sim:.4f}") | |
| else: | |
| content_sim = calculate_text_similarity(user_content, lost_content) | |
| logger.info(f"๋ด์ฉ ๊ณตํต ํค์๋ ์์, ๊ธฐ๋ณธ ์ ์ฌ๋: {content_sim:.4f}") | |
| else: | |
| content_sim = calculate_text_similarity(user_content, lost_content) | |
| logger.info(f"๋ด์ฉ ํค์๋ ์ถ์ถ ์คํจ, ๊ธฐ๋ณธ ์ ์ฌ๋: {content_sim:.4f}") | |
| else: | |
| logger.warning(f"๋ด์ฉ ๋๋ฝ: ์ฌ์ฉ์({user_content is not None}) ๋๋ ์ต๋๋ฌผ({lost_content is not None})") | |
| text_similarities['content'] = content_sim | |
| # ๊ฐ์ค์น ์กฐ์ | |
| ADJ_CATEGORY_WEIGHT = 0.35 | |
| ADJ_ITEM_NAME_WEIGHT = 0.35 | |
| ADJ_COLOR_WEIGHT = 0.15 | |
| ADJ_CONTENT_WEIGHT = 0.15 | |
| # ํ ์คํธ ์ข ํฉ ์ ์ฌ๋ ๊ณ์ฐ (๊ฐ์ค์น ์ ์ฉ) | |
| total_text_similarity = ( | |
| ADJ_CATEGORY_WEIGHT * category_sim + | |
| ADJ_ITEM_NAME_WEIGHT * item_name_sim + | |
| ADJ_COLOR_WEIGHT * color_sim + | |
| ADJ_CONTENT_WEIGHT * content_sim | |
| ) | |
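    # Worked example: category_sim = 1.0, item_name_sim = 0.8, color_sim = 1.0
    # and content_sim = 0.4 combine to
    #   0.35*1.0 + 0.35*0.8 + 0.15*1.0 + 0.15*0.4 = 0.35 + 0.28 + 0.15 + 0.06 = 0.84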
| # ์ต์ข ์ ์ฌ๋๋ ํ ์คํธ ์ ์ฌ๋๋ง ์ฌ์ฉ | |
| final_similarity = total_text_similarity | |
| # ์ ์ฌ๋ ๊ณ์ฐ ๊ฒฐ๊ณผ ๋ก๊น | |
| logger.info(f"์ ์ฌ๋ ๊ณ์ฐ ๊ฒฐ๊ณผ: ์นดํ ๊ณ ๋ฆฌ({category_sim:.4f}*{ADJ_CATEGORY_WEIGHT}) + ๋ฌผํ๋ช ({item_name_sim:.4f}*{ADJ_ITEM_NAME_WEIGHT}) + ์์({color_sim:.4f}*{ADJ_COLOR_WEIGHT}) + ๋ด์ฉ({content_sim:.4f}*{ADJ_CONTENT_WEIGHT}) = {final_similarity:.4f}") | |
| logger.info(f"==== ์ ์ฌ๋ ๊ณ์ฐ ์ข ๋ฃ ====") | |
| # ์ธ๋ถ ์ ์ฌ๋ ์ ๋ณด | |
| similarity_details = { | |
| 'text_similarity': total_text_similarity, | |
| 'image_similarity': None, # ์ด๋ฏธ์ง ์ ์ฌ๋ ์ฌ์ฉ ์ํจ | |
| 'final_similarity': final_similarity, | |
| 'details': text_similarities | |
| } | |
| return final_similarity, similarity_details | |
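
# A minimal usage sketch; the dicts below are hypothetical payloads that carry
# only the fields this module actually reads:
#   user_post = {'category': 3, 'title': '검은색 가죽 지갑',
#                'color': '검정', 'content': '강남역에서 잃어버렸어요'}
#   found_item = {'item_category_id': 3, 'name': '가죽 지갑',
#                 'color': '검정', 'detail': '강남역 근처에서 습득했습니다'}
#   score, details = calculate_similarity(user_post, found_item)
#   # details['details'] holds the per-field scores, e.g.
#   # {'category': 1.0, 'item_name': ..., 'color': 1.0, 'content': ...}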

def find_similar_items(user_post, lost_items, threshold=SIMILARITY_THRESHOLD, clip_model=None):
    """
    Find the found items that are similar to a user post.

    Args:
        user_post (dict): User post data.
        lost_items (list): List of found-item records.
        threshold (float): Similarity threshold (default comes from the configuration above).
        clip_model (KoreanCLIPModel, optional): CLIP model instance.

    Returns:
        list: Found items whose similarity meets the threshold, sorted by descending similarity.
    """
    similar_items = []
    logger.info(f"comparing the user post against {len(lost_items)} found items...")
    for item in lost_items:
        similarity, details = calculate_similarity(user_post, item, clip_model)
        if similarity >= threshold:
            similar_items.append({
                'item': item,
                'similarity': similarity,
                'details': details
            })
    # Sort by descending similarity
    similar_items.sort(key=lambda x: x['similarity'], reverse=True)
    logger.info(f"found {len(similar_items)} items with similarity >= {threshold}")
    return similar_items
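
# Usage sketch, reusing the hypothetical dicts from the example above:
#   matches = find_similar_items(user_post, [found_item], threshold=0.6)
#   for m in matches:
#       print(m['similarity'], m['item'].get('name'))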
| # ๋ชจ๋ ํ ์คํธ์ฉ ์ฝ๋ | |
| if __name__ == "__main__": | |
| # ํ ์คํธ ์ ์ฌ๋ ํ ์คํธ | |
| text1 = "๊ฒ์์ ๊ฐ์ฃฝ ์ง๊ฐ์ ์์ด๋ฒ๋ ธ์ต๋๋ค." | |
| text2 = "๊ฒ์ ๊ฐ์ฃฝ ์ง๊ฐ์ ์ฐพ์์ต๋๋ค." | |
| text3 = "๋ ธํธ๋ถ์ ๋ถ์คํ์ต๋๋ค." | |
| # ํค์๋ ์ถ์ถ ํ ์คํธ | |
| print("[ ํค์๋ ์ถ์ถ ํ ์คํธ ]") | |
| print(f"ํ ์คํธ 1: '{text1}'") | |
| print(f"์ถ์ถ๋ ํค์๋: {extract_keywords(text1)}") | |
| print(f"ํ ์คํธ 2: '{text2}'") | |
| print(f"์ถ์ถ๋ ํค์๋: {extract_keywords(text2)}") | |
| # ์ ์ฌ๋ ํ ์คํธ | |
| sim12 = calculate_text_similarity(text1, text2) | |
| sim13 = calculate_text_similarity(text1, text3) | |
| print("\n[ ์ ์ฌ๋ ํ ์คํธ ]") | |
| print(f"ํ ์คํธ 1-2 ์ ์ฌ๋: {sim12:.4f}") | |
| print(f"ํ ์คํธ 1-3 ์ ์ฌ๋: {sim13:.4f}") | |
| # ์นดํ ๊ณ ๋ฆฌ ์ ์ฌ๋ ํ ์คํธ | |
| cat1 = "์ง๊ฐ" | |
| cat2 = "๊ฐ๋ฐฉ/์ง๊ฐ" | |
| cat3 = "๊ธฐํ" | |
| cat_sim12 = calculate_category_similarity(cat1, cat2) | |
| cat_sim13 = calculate_category_similarity(cat1, cat3) | |
| print("\n[ ์นดํ ๊ณ ๋ฆฌ ์ ์ฌ๋ ํ ์คํธ ]") | |
| print(f"์นดํ ๊ณ ๋ฆฌ 1-2 ์ ์ฌ๋: {cat_sim12:.4f}") | |
| print(f"์นดํ ๊ณ ๋ฆฌ 1-3 ์ ์ฌ๋: {cat_sim13:.4f}") |