Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -173,13 +173,25 @@ class PDFRAGSystem:
|
|
173 |
# 텍스트를 청크로 분할
|
174 |
chunks = self.text_splitter.split_text(pdf_data["full_text"])
|
175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
# 청크 저장
|
177 |
self.document_chunks[doc_id] = chunks
|
178 |
|
179 |
-
# 임베딩 생성
|
180 |
if self.embedder:
|
181 |
-
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
|
184 |
# 문서 정보 저장
|
185 |
self.documents[doc_id] = {
|
@@ -188,6 +200,9 @@ class PDFRAGSystem:
|
|
188 |
"upload_time": datetime.now().isoformat()
|
189 |
}
|
190 |
|
|
|
|
|
|
|
191 |
return {
|
192 |
"success": True,
|
193 |
"doc_id": doc_id,
|
@@ -197,6 +212,7 @@ class PDFRAGSystem:
|
|
197 |
}
|
198 |
|
199 |
except Exception as e:
|
|
|
200 |
return {"success": False, "error": str(e)}
|
201 |
|
202 |
def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
|
@@ -205,54 +221,75 @@ class PDFRAGSystem:
|
|
205 |
|
206 |
print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
|
213 |
-
|
|
|
|
|
214 |
doc_embeddings = self.embeddings_store[doc_id]
|
215 |
-
chunks = self.document_chunks[doc_id]
|
216 |
|
217 |
-
# 코사인 유사도 계산
|
218 |
similarities = []
|
219 |
-
for emb in doc_embeddings:
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
"similarity": score / len(query_keywords) if query_keywords else 0
|
250 |
-
})
|
251 |
|
252 |
-
#
|
253 |
all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
|
|
|
|
|
254 |
result = all_relevant_chunks[:top_k]
|
255 |
print(f"Returning {len(result)} chunks")
|
|
|
|
|
|
|
|
|
|
|
256 |
return result
|
257 |
|
258 |
def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
|
@@ -262,10 +299,23 @@ class PDFRAGSystem:
|
|
262 |
relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
|
263 |
|
264 |
if not relevant_chunks:
|
265 |
-
print("No relevant chunks found")
|
266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
|
268 |
-
print(f"
|
269 |
|
270 |
# 컨텍스트 구성
|
271 |
context_parts = []
|
@@ -274,15 +324,18 @@ class PDFRAGSystem:
|
|
274 |
|
275 |
for i, chunk in enumerate(relevant_chunks, 1):
|
276 |
context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
|
277 |
-
|
|
|
278 |
context_parts.append(content)
|
279 |
-
print(f"Added chunk {i} with similarity: {chunk.get('similarity', 0):.3f}")
|
280 |
|
281 |
context_parts.append("\n" + "=" * 40)
|
282 |
|
283 |
context = "\n".join(context_parts)
|
284 |
enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
|
285 |
|
|
|
|
|
286 |
return enhanced_query, context
|
287 |
|
288 |
# Initialize model and RAG system
|
|
|
173 |
# 텍스트를 청크로 분할
|
174 |
chunks = self.text_splitter.split_text(pdf_data["full_text"])
|
175 |
|
176 |
+
if not chunks:
|
177 |
+
print("Warning: No chunks created from PDF")
|
178 |
+
return {"success": False, "error": "No text content found in PDF"}
|
179 |
+
|
180 |
+
print(f"Created {len(chunks)} chunks from PDF")
|
181 |
+
|
182 |
# 청크 저장
|
183 |
self.document_chunks[doc_id] = chunks
|
184 |
|
185 |
+
# 임베딩 생성 (선택적)
|
186 |
if self.embedder:
|
187 |
+
try:
|
188 |
+
print("Generating embeddings...")
|
189 |
+
embeddings = self.embedder.encode(chunks)
|
190 |
+
self.embeddings_store[doc_id] = embeddings
|
191 |
+
print(f"Generated {len(embeddings)} embeddings")
|
192 |
+
except Exception as e:
|
193 |
+
print(f"Warning: Failed to generate embeddings: {e}")
|
194 |
+
# 임베딩 실패해도 계속 진행
|
195 |
|
196 |
# 문서 정보 저장
|
197 |
self.documents[doc_id] = {
|
|
|
200 |
"upload_time": datetime.now().isoformat()
|
201 |
}
|
202 |
|
203 |
+
# 디버그: 첫 번째 청크 출력
|
204 |
+
print(f"First chunk preview: {chunks[0][:200]}...")
|
205 |
+
|
206 |
return {
|
207 |
"success": True,
|
208 |
"doc_id": doc_id,
|
|
|
212 |
}
|
213 |
|
214 |
except Exception as e:
|
215 |
+
print(f"Error processing PDF: {e}")
|
216 |
return {"success": False, "error": str(e)}
|
217 |
|
218 |
def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
|
|
|
221 |
|
222 |
print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
|
223 |
|
224 |
+
# 먼저 문서가 있는지 확인
|
225 |
+
for doc_id in doc_ids:
|
226 |
+
if doc_id not in self.document_chunks:
|
227 |
+
print(f"Warning: Document {doc_id} not found in chunks")
|
228 |
+
continue
|
229 |
+
|
230 |
+
chunks = self.document_chunks[doc_id]
|
231 |
+
print(f"Document {doc_id} has {len(chunks)} chunks")
|
232 |
|
233 |
+
# 임베딩 기반 검색 시도
|
234 |
+
if self.embedder and doc_id in self.embeddings_store:
|
235 |
+
try:
|
236 |
+
query_embedding = self.embedder.encode([query])[0]
|
237 |
doc_embeddings = self.embeddings_store[doc_id]
|
|
|
238 |
|
239 |
+
# 코사인 유사도 계산 (안전하게)
|
240 |
similarities = []
|
241 |
+
for i, emb in enumerate(doc_embeddings):
|
242 |
+
try:
|
243 |
+
query_norm = np.linalg.norm(query_embedding)
|
244 |
+
emb_norm = np.linalg.norm(emb)
|
245 |
+
|
246 |
+
if query_norm > 0 and emb_norm > 0:
|
247 |
+
sim = np.dot(query_embedding, emb) / (query_norm * emb_norm)
|
248 |
+
similarities.append(sim)
|
249 |
+
else:
|
250 |
+
similarities.append(0.0)
|
251 |
+
except Exception as e:
|
252 |
+
print(f"Error calculating similarity for chunk {i}: {e}")
|
253 |
+
similarities.append(0.0)
|
254 |
|
255 |
+
# 상위 청크 선택
|
256 |
+
if similarities:
|
257 |
+
top_indices = np.argsort(similarities)[-min(top_k, len(similarities)):][::-1]
|
258 |
+
|
259 |
+
for idx in top_indices:
|
260 |
+
if idx < len(chunks): # ์ธ๋ฑ์ค ๋ฒ์ ํ์ธ
|
261 |
+
all_relevant_chunks.append({
|
262 |
+
"content": chunks[idx],
|
263 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
264 |
+
"similarity": similarities[idx]
|
265 |
+
})
|
266 |
+
print(f"Added chunk {idx} with similarity: {similarities[idx]:.3f}")
|
267 |
+
except Exception as e:
|
268 |
+
print(f"Error in embedding search: {e}")
|
269 |
+
# 임베딩 실패시 폴백
|
270 |
|
271 |
+
# 임베딩이 없거나 실패한 경우 - 간단히 처음 N개 청크 반환
|
272 |
+
if not all_relevant_chunks:
|
273 |
+
print(f"Falling back to simple chunk selection for {doc_id}")
|
274 |
+
for i in range(min(top_k, len(chunks))):
|
275 |
+
all_relevant_chunks.append({
|
276 |
+
"content": chunks[i],
|
277 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
278 |
+
"similarity": 1.0 - (i * 0.1) # ์์๋๋ก ๊ฐ์ค์น
|
279 |
+
})
|
280 |
+
print(f"Added chunk {i} (fallback)")
|
|
|
|
|
281 |
|
282 |
+
# 유사도 기준 정렬
|
283 |
all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
|
284 |
+
|
285 |
+
# 상위 K개 선택
|
286 |
result = all_relevant_chunks[:top_k]
|
287 |
print(f"Returning {len(result)} chunks")
|
288 |
+
|
289 |
+
# 디버그: 첫 번째 청크 내용 일부 출력
|
290 |
+
if result:
|
291 |
+
print(f"First chunk preview: {result[0]['content'][:100]}...")
|
292 |
+
|
293 |
return result
|
294 |
|
295 |
def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
|
|
|
299 |
relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
|
300 |
|
301 |
if not relevant_chunks:
|
302 |
+
print("No relevant chunks found - checking if documents exist")
|
303 |
+
# 문서가 있는데 청크를 못 찾은 경우, 첫 번째 청크라도 사용
|
304 |
+
for doc_id in doc_ids:
|
305 |
+
if doc_id in self.document_chunks and self.document_chunks[doc_id]:
|
306 |
+
print(f"Using first chunk from {doc_id} as fallback")
|
307 |
+
relevant_chunks = [{
|
308 |
+
"content": self.document_chunks[doc_id][0],
|
309 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
310 |
+
"similarity": 0.5
|
311 |
+
}]
|
312 |
+
break
|
313 |
+
|
314 |
+
if not relevant_chunks:
|
315 |
+
print("No documents or chunks available")
|
316 |
+
return query, ""
|
317 |
|
318 |
+
print(f"Using {len(relevant_chunks)} chunks for context")
|
319 |
|
320 |
# 컨텍스트 구성
|
321 |
context_parts = []
|
|
|
324 |
|
325 |
for i, chunk in enumerate(relevant_chunks, 1):
|
326 |
context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
|
327 |
+
# 청크 크기 증가
|
328 |
+
content = chunk['content'][:1000] if len(chunk['content']) > 1000 else chunk['content']
|
329 |
context_parts.append(content)
|
330 |
+
print(f"Added chunk {i} ({len(content)} chars) with similarity: {chunk.get('similarity', 0):.3f}")
|
331 |
|
332 |
context_parts.append("\n" + "=" * 40)
|
333 |
|
334 |
context = "\n".join(context_parts)
|
335 |
enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
|
336 |
|
337 |
+
print(f"Enhanced query length: {len(enhanced_query)} chars (original: {len(query)} chars)")
|
338 |
+
|
339 |
return enhanced_query, context
|
340 |
|
341 |
# Initialize model and RAG system
|