# Source: ssfinder-matching/utils/similarity.py
# Last commit: "Update utils/similarity.py" (3589a31, verified), author avatar: asefasdfcv
"""
์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ๋ฐ ๊ด€๋ จ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
Kiwi ํ˜•ํƒœ์†Œ ๋ถ„์„๊ธฐ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ ๋ถ„์„ ๊ฐœ์„ 
"""
import os
import sys
import logging
import numpy as np
import re
from collections import Counter
from kiwipiepy import Kiwi
# Logging configuration (applied at import time)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Initialise the Kiwi morphological analyzer once; extract_keywords() uses this shared instance
kiwi = Kiwi()
# Configuration values (read from environment variables, falling back to the defaults shown)
# SIMILARITY_THRESHOLD is the default cut-off used by find_similar_items().
SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', '0.6'))
# NOTE(review): in the code visible in this file the weights below are not referenced;
# calculate_similarity() uses its own hard-coded weights. Confirm whether other modules import them.
TEXT_WEIGHT = float(os.getenv('TEXT_WEIGHT', '0.7'))
IMAGE_WEIGHT = float(os.getenv('IMAGE_WEIGHT', '0.3'))
CATEGORY_WEIGHT = float(os.getenv('CATEGORY_WEIGHT', '0.5'))
ITEM_NAME_WEIGHT = float(os.getenv('ITEM_NAME_WEIGHT', '0.3'))
COLOR_WEIGHT = float(os.getenv('COLOR_WEIGHT', '0.1'))
CONTENT_WEIGHT = float(os.getenv('CONTENT_WEIGHT', '0.1'))
def preprocess_text(text):
    """
    Normalise a text so that two texts can be compared.

    The text is lower-cased, every character that is not a word character,
    whitespace or Hangul is replaced by a space, and runs of whitespace are
    then collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text (str): Text to normalise. Falsy values give ""; other non-string
            values are converted with ``str()`` first.
    Returns:
        str: The normalised text.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Lower-case (only has an effect on cased scripts such as Latin)
    text = text.lower()
    # Replace special characters with a space while keeping Hangul syllables
    # (U+AC00-U+D7A3), Hangul compatibility jamo (U+3131-U+3163), Latin letters
    # and digits. The ranges are written as escapes so the pattern cannot be
    # damaged by a file-encoding mix-up.
    text = re.sub(r'[^\w\s\uAC00-\uD7A3\u3131-\u3163]', ' ', text)
    # Collapse whitespace last, so the spaces that replaced punctuation do not
    # leave double or trailing spaces in the result.
    return re.sub(r'\s+', ' ', text).strip()
def extract_keywords(text):
    """
    Extract the significant keywords of a text with the Kiwi morphological analyzer.

    Args:
        text (str): Text to extract keywords from.
    Returns:
        list: Surface forms of the kept tokens. If the analysis raises, the
            whitespace-separated words of the preprocessed text are returned
            instead. Falsy input gives an empty list.
    """
    if not text:
        return []

    processed_text = preprocess_text(text)
    # NNG: common noun, NNP: proper noun, VA: adjective, SL: foreign word (e.g. English)
    kept_tags = ('NNG', 'NNP', 'VA', 'SL')

    try:
        # Token list of the first entry returned by kiwi.analyze()
        first_tokens = kiwi.analyze(processed_text)[0][0]
        # Single-character forms are dropped as low-value, except foreign words
        keywords = [
            token.form
            for token in first_tokens
            if token.tag in kept_tags and (len(token.form) > 1 or token.tag == 'SL')
        ]
        logger.debug(f"ํ‚ค์›Œ๋“œ ์ถ”์ถœ ๊ฒฐ๊ณผ: {keywords}")
        return keywords
    except Exception as e:
        # Best-effort fallback: plain whitespace split of the preprocessed text
        logger.warning(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}, ๊ธฐ๋ณธ ๋ถ„๋ฆฌ ๋ฐฉ์‹์œผ๋กœ ๋Œ€์ฒด")
        return processed_text.split()
def calculate_text_similarity(text1, text2, weights=None):
    """
    Compute the similarity of two texts using Kiwi keyword extraction.

    Shortcuts are tried first: identical preprocessed texts score 1.0, and one
    preprocessed text contained in the other scores between 0.7 and 1.0
    depending on the length ratio. Otherwise the score is a weighted sum of the
    common-keyword ratio, the keyword-count ratio and a word-order similarity.

    Args:
        text1 (str): First text.
        text2 (str): Second text.
        weights (dict, optional): Weights for the keys 'common_words',
            'length_ratio' and 'word_order'. Missing keys fall back to the
            defaults 0.7 / 0.15 / 0.15.
    Returns:
        float: Similarity score between 0 and 1.
    """
    if not text1 or not text2:
        return 0.0

    clean_text1 = preprocess_text(text1)
    clean_text2 = preprocess_text(text2)

    # A text that reduces to nothing (e.g. punctuation only) carries no
    # information. Without this guard the empty string would count as
    # "contained" in any other text and score 0.7, and two different
    # punctuation-only texts would count as an exact match.
    if not clean_text1.strip() or not clean_text2.strip():
        return 0.0

    # 1. Exact match (preprocess_text already lower-cases)
    if clean_text1 == clean_text2:
        return 1.0

    # 2. Containment: the shorter text appears inside the longer one
    if clean_text1 in clean_text2 or clean_text2 in clean_text1:
        len_ratio = min(len(clean_text1), len(clean_text2)) / max(len(clean_text1), len(clean_text2))
        return max(0.7, 0.7 + 0.3 * len_ratio)  # value in 0.7..1.0

    # Caller-supplied weights override the defaults key by key, so a partial
    # dict no longer raises KeyError.
    default_weights = {
        'common_words': 0.7,    # weight of the common-keyword ratio
        'length_ratio': 0.15,   # weight of the keyword-count similarity
        'word_order': 0.15,     # weight of the word-order similarity
    }
    weights = {**default_weights, **(weights or {})}

    keywords1 = extract_keywords(text1)
    keywords2 = extract_keywords(text2)

    if not keywords1 or not keywords2:
        # No keywords on one side: fall back to word-level Jaccard, floor 0.1
        return max(0.1, calculate_jaccard_similarity(clean_text1, clean_text2))

    unique1 = set(keywords1)
    unique2 = set(keywords2)
    common_words = unique1 & unique2

    # 3. Common-keyword ratio
    if common_words:
        common_ratio = len(common_words) / min(len(unique1), len(unique2))
        # Multi-character, non-numeric shared keywords raise the ratio to at least 0.5..0.8
        important_keywords = [w for w in common_words if len(w) > 1 and not w.isdigit()]
        if important_keywords:
            common_ratio = max(common_ratio, 0.5 + 0.3 * (len(important_keywords) / len(common_words)))
    else:
        # No shared keywords: halved Jaccard similarity of the cleaned texts
        common_ratio = calculate_jaccard_similarity(clean_text1, clean_text2) * 0.5

    # 4. Keyword-count similarity (both lists are non-empty here)
    longest = max(len(keywords1), len(keywords2))
    length_ratio = min(len(keywords1), len(keywords2)) / longest

    # 5. Word-order similarity from the position differences of shared keywords
    word_order_sim = 0.0
    if common_words:
        # For a repeated keyword the last occurrence wins
        positions1 = {word: i for i, word in enumerate(keywords1) if word in common_words}
        positions2 = {word: i for i, word in enumerate(keywords2) if word in common_words}
        pos_diff_sum = sum(abs(positions1[word] - positions2[word]) for word in common_words)
        max_diff = len(keywords1) + len(keywords2)
        word_order_sim = 1.0 - min(1.0, pos_diff_sum / max_diff)

    similarity = (
        weights['common_words'] * common_ratio +
        weights['length_ratio'] * length_ratio +
        weights['word_order'] * word_order_sim
    )

    # Guarantee a minimum score whenever at least one keyword is shared
    if common_words:
        similarity = max(similarity, 0.1 + 0.2 * len(common_words) / longest)

    return min(1.0, max(0.0, similarity))
# ์ž์นด๋“œ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ํ•จ์ˆ˜ ์ถ”๊ฐ€
def calculate_jaccard_similarity(text1, text2):
    """
    Compute the word-level Jaccard similarity of two texts.

    Both texts are lower-cased and split on whitespace; the score is the size
    of the shared word set divided by the size of the combined word set.

    Args:
        text1 (str): First text.
        text2 (str): Second text.
    Returns:
        float: Jaccard similarity between 0 and 1 (0.0 if either text has no words).
    """
    words_a = set(text1.lower().split())
    words_b = set(text2.lower().split())

    if not (words_a and words_b):
        return 0.0

    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / max(1, len(combined))
def calculate_category_similarity(category1, category2):
    """
    Compute the similarity of two categories.

    Two integer IDs are compared for equality. Anything else is compared as
    text: exact match (case-insensitive) scores 1.0, containment at least 0.8,
    shared keywords at least 0.5, an "other" category 0.3, and otherwise the
    result of calculate_text_similarity() is returned.

    Args:
        category1 (str or int): First category.
        category2 (str or int): Second category.
    Returns:
        float: Similarity score between 0 and 1.
    """
    def _is_int_id(value):
        # bool is a subclass of int but is not a meaningful category ID
        return isinstance(value, int) and not isinstance(value, bool)

    # Integer IDs are compared first so that ID 0 is not mistaken for a missing value
    if _is_int_id(category1) and _is_int_id(category2):
        return 1.0 if category1 == category2 else 0.0

    # None / empty values (an integer 0 mixed with a string is still a value)
    for value in (category1, category2):
        if not _is_int_id(value) and not value:
            return 0.0

    cat1 = str(category1).strip()
    cat2 = str(category2).strip()

    # Whitespace-only categories are treated as missing
    if not cat1 or not cat2:
        return 0.0

    # Exact match
    if cat1.lower() == cat2.lower():
        return 1.0

    cat1_processed = preprocess_text(cat1).strip()
    cat2_processed = preprocess_text(cat2).strip()

    # Nothing comparable left after preprocessing (e.g. punctuation only).
    # Without this guard the empty string would "match" or be "contained".
    if not cat1_processed or not cat2_processed:
        return 0.0

    # Match after preprocessing
    if cat1_processed == cat2_processed:
        return 1.0

    # Containment (e.g. "wallet" and "leather wallet"): at least 0.8
    if cat1_processed in cat2_processed or cat2_processed in cat1_processed:
        len_ratio = min(len(cat1_processed), len(cat2_processed)) / max(len(cat1_processed), len(cat2_processed))
        return max(0.8, len_ratio)

    # Shared keywords: at least 0.5
    keywords1 = set(extract_keywords(cat1))
    keywords2 = set(extract_keywords(cat2))
    common_keywords = keywords1 & keywords2
    if common_keywords:
        # Both sets are non-empty whenever they intersect
        common_ratio = len(common_keywords) / min(len(keywords1), len(keywords2))
        return max(0.5, common_ratio)

    # "Other" category (Korean word U+AE30 U+D0C0): only a weak association.
    # Written with escapes so the literal survives file-encoding mix-ups.
    other_category = '\uae30\ud0c0'
    if other_category in cat1 or other_category in cat2:
        return 0.3

    # Last resort: generic text similarity
    return calculate_text_similarity(cat1, cat2)
def _first_truthy_field(record, fields):
    """Return (field, value) for the first of *fields* with a truthy value in *record*, else (None, None)."""
    for field in fields:
        value = record.get(field)
        if value:
            return field, value
    return None, None


def _category_id_similarity(user_post, lost_item):
    """Return 1.0 when both category IDs are present and equal as integers, otherwise 0.0."""
    # User side: 'category', falling back to 'itemCategoryId'
    user_category_id = user_post.get('category')
    if user_category_id is None:
        user_category_id = user_post.get('itemCategoryId')
    # Found-item side: only 'item_category_id'
    lost_category_id = lost_item.get('item_category_id')

    logger.info(f"์นดํ…Œ๊ณ ๋ฆฌ ID ๋น„๊ต: ์‚ฌ์šฉ์ž({user_category_id}) vs ์Šต๋“๋ฌผ({lost_category_id})")

    if user_category_id is None or lost_category_id is None:
        return 0.0
    try:
        # IDs may arrive as strings; compare them numerically
        category_sim = 1.0 if int(user_category_id) == int(lost_category_id) else 0.0
    except (ValueError, TypeError):
        logger.warning(f"์นดํ…Œ๊ณ ๋ฆฌ ID๋ฅผ ์ˆซ์ž๋กœ ๋ณ€ํ™˜ํ•  ์ˆ˜ ์—†์Œ: {user_category_id}, {lost_category_id}")
        return 0.0
    logger.info(f"์นดํ…Œ๊ณ ๋ฆฌ ID ์ผ์น˜ ์—ฌ๋ถ€: {category_sim}")
    return category_sim


def _item_name_similarity(user_post, lost_item, category_sim):
    """Return the item-name similarity; *category_sim* is used only when the user supplied no name."""
    # User item name: title, then search_keyword, then the first 10 words of content
    user_item_name = user_post.get('title') or user_post.get('search_keyword') or None
    if user_item_name is None and user_post.get('content'):
        # str() guards against non-string content values
        words = str(user_post['content']).split()[:10]
        if words:
            user_item_name = ' '.join(words)

    # Found-item name: name, then title
    lost_item_name = lost_item.get('name') or lost_item.get('title') or None

    logger.info(f"๋ฌผํ’ˆ๋ช… ํ•„๋“œ: ์‚ฌ์šฉ์ž({user_item_name}) vs ์Šต๋“๋ฌผ({lost_item_name})")

    item_name_sim = 0.0
    if user_item_name and lost_item_name:
        user_item_name_clean = preprocess_text(str(user_item_name)).strip()
        lost_item_name_clean = preprocess_text(str(lost_item_name)).strip()

        item_name_sim = calculate_text_similarity(user_item_name_clean, lost_item_name_clean)

        # Exact match / containment override the base score. Names that clean
        # to nothing are skipped: the empty string is "contained" in every
        # string and would otherwise score 0.8.
        if user_item_name_clean and lost_item_name_clean:
            if user_item_name_clean == lost_item_name_clean:
                item_name_sim = 1.0
                logger.info("๋ฌผํ’ˆ๋ช… ์™„์ „ ์ผ์น˜")
            elif user_item_name_clean in lost_item_name_clean or lost_item_name_clean in user_item_name_clean:
                item_name_sim = 0.8
                logger.info("๋ฌผํ’ˆ๋ช… ํฌํ•จ ๊ด€๊ณ„ ๊ฐ์ง€")
    elif user_item_name is None and lost_item_name:
        # No user item name: grant a minimum score if category or colour agree.
        # Colours must be non-empty after cleaning; two missing colours are not a match.
        user_color_clean = preprocess_text(user_post.get('color')).strip()
        lost_color_clean = preprocess_text(lost_item.get('color')).strip()
        colors_match = bool(user_color_clean) and user_color_clean == lost_color_clean
        if category_sim > 0.5 or colors_match:
            item_name_sim = 0.3
            logger.info("์‚ฌ์šฉ์ž ๋ฌผํ’ˆ๋ช… ์—†์Œ, ์นดํ…Œ๊ณ ๋ฆฌ/์ƒ‰์ƒ ์œ ์‚ฌ์„ฑ ๊ธฐ๋ฐ˜ ์ตœ์†Œ ์œ ์‚ฌ๋„ ๋ถ€์—ฌ")
        else:
            logger.warning(f"์‚ฌ์šฉ์ž ๋ฌผํ’ˆ๋ช… ๋ˆ„๋ฝ, ์œ ์‚ฌ๋„ 0")
    else:
        logger.warning(f"๋ฌผํ’ˆ๋ช… ๋น„๊ต ๋ถˆ๊ฐ€: ์‚ฌ์šฉ์ž({user_item_name}) ๋˜๋Š” ์Šต๋“๋ฌผ({lost_item_name}) ๋ฌผํ’ˆ๋ช… ๋ˆ„๋ฝ")
    return item_name_sim


def _color_similarity(user_post, lost_item):
    """Return the colour similarity of the two records' 'color' fields."""
    user_color = user_post.get('color', '')
    lost_color = lost_item.get('color', '')

    logger.info(f"์ƒ‰์ƒ ๋น„๊ต: ์‚ฌ์šฉ์ž({user_color}) vs ์Šต๋“๋ฌผ({lost_color})")

    user_color_clean = preprocess_text(str(user_color)).strip() if user_color else ''
    lost_color_clean = preprocess_text(str(lost_color)).strip() if lost_color else ''

    # Missing colours, or colours that clean to nothing, cannot be compared
    if not (user_color_clean and lost_color_clean):
        logger.warning(f"์ƒ‰์ƒ ๋ˆ„๋ฝ: ์‚ฌ์šฉ์ž({user_color}) ๋˜๋Š” ์Šต๋“๋ฌผ({lost_color})")
        return 0.0

    if user_color_clean == lost_color_clean:
        logger.info("์ƒ‰์ƒ ์™„์ „ ์ผ์น˜")
        return 1.0

    # Any shared keyword counts as a strong colour match
    common_keywords = set(extract_keywords(user_color)) & set(extract_keywords(lost_color))
    if common_keywords:
        logger.info(f"์ƒ‰์ƒ ๊ณตํ†ต ํ‚ค์›Œ๋“œ: {common_keywords}")
        return 0.8

    color_sim = calculate_text_similarity(user_color, lost_color)
    logger.info(f"์ƒ‰์ƒ ๊ธฐ๋ณธ ์œ ์‚ฌ๋„: {color_sim}")
    return color_sim


def _content_similarity(user_post, lost_item):
    """Return the similarity of the free-text description fields of the two records."""
    content_fields = ('detail', 'content', 'description')
    user_content_field, user_content = _first_truthy_field(user_post, content_fields)
    lost_content_field, lost_content = _first_truthy_field(lost_item, content_fields)

    logger.info(f"๋‚ด์šฉ ํ•„๋“œ: ์‚ฌ์šฉ์ž({user_content_field}) vs ์Šต๋“๋ฌผ({lost_content_field})")

    if not (user_content and lost_content):
        logger.warning(f"๋‚ด์šฉ ๋ˆ„๋ฝ: ์‚ฌ์šฉ์ž({user_content is not None}) ๋˜๋Š” ์Šต๋“๋ฌผ({lost_content is not None})")
        return 0.0

    # Descriptions can be short, so the comparison focuses on extracted keywords
    user_content_keywords = extract_keywords(user_content)
    lost_content_keywords = extract_keywords(lost_content)
    logger.info(f"๋‚ด์šฉ ํ‚ค์›Œ๋“œ ์ˆ˜: ์‚ฌ์šฉ์ž({len(user_content_keywords)}๊ฐœ) vs ์Šต๋“๋ฌผ({len(lost_content_keywords)}๊ฐœ)")

    if not (user_content_keywords and lost_content_keywords):
        content_sim = calculate_text_similarity(user_content, lost_content)
        logger.info(f"๋‚ด์šฉ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ์‹คํŒจ, ๊ธฐ๋ณธ ์œ ์‚ฌ๋„: {content_sim:.4f}")
        return content_sim

    common_keywords = set(user_content_keywords) & set(lost_content_keywords)
    if not common_keywords:
        content_sim = calculate_text_similarity(user_content, lost_content)
        logger.info(f"๋‚ด์šฉ ๊ณตํ†ต ํ‚ค์›Œ๋“œ ์—†์Œ, ๊ธฐ๋ณธ ์œ ์‚ฌ๋„: {content_sim:.4f}")
        return content_sim

    common_ratio = len(common_keywords) / min(len(user_content_keywords), len(lost_content_keywords))
    logger.info(f"๋‚ด์šฉ ๊ณตํ†ต ํ‚ค์›Œ๋“œ: {len(common_keywords)}๊ฐœ, ๊ณตํ†ต ๋น„์œจ: {common_ratio:.4f}")

    if common_ratio >= 0.5:
        # At least half of the keywords are shared: score at least 0.7
        content_sim = max(0.7, common_ratio)
        logger.info(f"๋‚ด์šฉ ๋†’์€ ๊ณตํ†ต ๋น„์œจ: {content_sim:.4f}")
    else:
        text_sim = calculate_text_similarity(user_content, lost_content)
        content_sim = max(text_sim, common_ratio)
        logger.info(f"๋‚ด์šฉ ๊ธฐ๋ณธ ์œ ์‚ฌ๋„: {text_sim:.4f}, ์ตœ์ข…: {content_sim:.4f}")
    return content_sim


def calculate_similarity(user_post, lost_item, clip_model=None):
    """
    Compute the overall similarity between a user post and a found item.

    Four partial scores are computed (category ID, item name, colour, content)
    and combined with fixed weights 0.35 / 0.35 / 0.15 / 0.15. Only text
    similarity is used; no image similarity is computed and *clip_model* is
    not used by this function.

    Args:
        user_post (dict): User post (lost item). Fields read: 'category' or
            'itemCategoryId', 'title', 'search_keyword', 'content', 'color',
            'detail', 'description'.
        lost_item (dict): Found-item record. Fields read: 'item_category_id',
            'name', 'title', 'color', 'detail', 'content', 'description'.
        clip_model (optional): Accepted for interface compatibility; unused here.
    Returns:
        tuple: (final similarity as float in 0..1, dict with 'text_similarity',
            'image_similarity' (always None), 'final_similarity' and 'details'
            holding the four partial scores).
    """
    logger.info(f"==== ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ์‹œ์ž‘ ====")

    # The helpers are called in this fixed order so the log output stays in sequence
    category_sim = _category_id_similarity(user_post, lost_item)
    item_name_sim = _item_name_similarity(user_post, lost_item, category_sim)
    color_sim = _color_similarity(user_post, lost_item)
    content_sim = _content_similarity(user_post, lost_item)

    text_similarities = {
        'category': category_sim,
        'item_name': item_name_sim,
        'color': color_sim,
        'content': content_sim,
    }

    # Adjusted weights (hard-coded; the module-level *_WEIGHT settings are not used here)
    ADJ_CATEGORY_WEIGHT = 0.35
    ADJ_ITEM_NAME_WEIGHT = 0.35
    ADJ_COLOR_WEIGHT = 0.15
    ADJ_CONTENT_WEIGHT = 0.15

    total_text_similarity = (
        ADJ_CATEGORY_WEIGHT * category_sim +
        ADJ_ITEM_NAME_WEIGHT * item_name_sim +
        ADJ_COLOR_WEIGHT * color_sim +
        ADJ_CONTENT_WEIGHT * content_sim
    )

    # The final score is the text similarity alone
    final_similarity = total_text_similarity

    logger.info(f"์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ๊ฒฐ๊ณผ: ์นดํ…Œ๊ณ ๋ฆฌ({category_sim:.4f}*{ADJ_CATEGORY_WEIGHT}) + ๋ฌผํ’ˆ๋ช…({item_name_sim:.4f}*{ADJ_ITEM_NAME_WEIGHT}) + ์ƒ‰์ƒ({color_sim:.4f}*{ADJ_COLOR_WEIGHT}) + ๋‚ด์šฉ({content_sim:.4f}*{ADJ_CONTENT_WEIGHT}) = {final_similarity:.4f}")
    logger.info(f"==== ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ ์ข…๋ฃŒ ====")

    similarity_details = {
        'text_similarity': total_text_similarity,
        'image_similarity': None,  # image similarity is not used
        'final_similarity': final_similarity,
        'details': text_similarities
    }
    return final_similarity, similarity_details
def find_similar_items(user_post, lost_items, threshold=SIMILARITY_THRESHOLD, clip_model=None):
    """
    Find the found items that are similar to a user post.

    Args:
        user_post (dict): User post information.
        lost_items (list): Found-item records to compare against.
        threshold (float): Minimum similarity for an item to be returned
            (defaults to SIMILARITY_THRESHOLD).
        clip_model (optional): Passed through to calculate_similarity().
    Returns:
        list: Dicts with keys 'item', 'similarity' and 'details' for every item
            whose similarity is at least *threshold*, highest similarity first.
    """
    logger.info(f"์‚ฌ์šฉ์ž ๊ฒŒ์‹œ๊ธ€๊ณผ {len(lost_items)}๊ฐœ ์Šต๋“๋ฌผ ๋น„๊ต ์ค‘...")

    matches = []
    for candidate in lost_items:
        score, breakdown = calculate_similarity(user_post, candidate, clip_model)
        if score >= threshold:
            matches.append({'item': candidate, 'similarity': score, 'details': breakdown})

    # Highest similarity first (stable, so ties keep their input order)
    similar_items = sorted(matches, key=lambda entry: entry['similarity'], reverse=True)

    logger.info(f"์œ ์‚ฌ๋„ {threshold} ์ด์ƒ์ธ ์Šต๋“๋ฌผ {len(similar_items)}๊ฐœ ๋ฐœ๊ฒฌ")
    return similar_items
# Manual smoke test, run only when the module is executed directly
if __name__ == "__main__":
    # Sample texts for the text-similarity check
    text1, text2, text3 = (
        "๊ฒ€์€์ƒ‰ ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์žƒ์–ด๋ฒ„๋ ธ์Šต๋‹ˆ๋‹ค.",
        "๊ฒ€์ • ๊ฐ€์ฃฝ ์ง€๊ฐ‘์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.",
        "๋…ธํŠธ๋ถ์„ ๋ถ„์‹คํ–ˆ์Šต๋‹ˆ๋‹ค.",
    )

    # Keyword extraction check
    print("[ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ…Œ์ŠคํŠธ ]")
    print(f"ํ…์ŠคํŠธ 1: '{text1}'")
    print(f"์ถ”์ถœ๋œ ํ‚ค์›Œ๋“œ: {extract_keywords(text1)}")
    print(f"ํ…์ŠคํŠธ 2: '{text2}'")
    print(f"์ถ”์ถœ๋œ ํ‚ค์›Œ๋“œ: {extract_keywords(text2)}")

    # Text similarity check (both scores are computed before anything is printed)
    sim12, sim13 = (calculate_text_similarity(text1, other) for other in (text2, text3))
    print("\n[ ์œ ์‚ฌ๋„ ํ…Œ์ŠคํŠธ ]")
    print(f"ํ…์ŠคํŠธ 1-2 ์œ ์‚ฌ๋„: {sim12:.4f}")
    print(f"ํ…์ŠคํŠธ 1-3 ์œ ์‚ฌ๋„: {sim13:.4f}")

    # Category similarity check
    cat1, cat2, cat3 = ("์ง€๊ฐ‘", "๊ฐ€๋ฐฉ/์ง€๊ฐ‘", "๊ธฐํƒ€")
    cat_sim12, cat_sim13 = (calculate_category_similarity(cat1, other) for other in (cat2, cat3))
    print("\n[ ์นดํ…Œ๊ณ ๋ฆฌ ์œ ์‚ฌ๋„ ํ…Œ์ŠคํŠธ ]")
    print(f"์นดํ…Œ๊ณ ๋ฆฌ 1-2 ์œ ์‚ฌ๋„: {cat_sim12:.4f}")
    print(f"์นดํ…Œ๊ณ ๋ฆฌ 1-3 ์œ ์‚ฌ๋„: {cat_sim13:.4f}")