openfree committed on
Commit
ec0ec22
·
verified ·
1 Parent(s): 71385e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -46
app.py CHANGED
@@ -173,13 +173,25 @@ class PDFRAGSystem:
173
  # Split the text into chunks
174
  chunks = self.text_splitter.split_text(pdf_data["full_text"])
175
 
 
 
 
 
 
 
176
  # ์ฒญํฌ ์ €์žฅ
177
  self.document_chunks[doc_id] = chunks
178
 
179
- # ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
180
  if self.embedder:
181
- embeddings = self.embedder.encode(chunks)
182
- self.embeddings_store[doc_id] = embeddings
 
 
 
 
 
 
183
 
184
  # Store document info
185
  self.documents[doc_id] = {
@@ -188,6 +200,9 @@ class PDFRAGSystem:
188
  "upload_time": datetime.now().isoformat()
189
  }
190
 
 
 
 
191
  return {
192
  "success": True,
193
  "doc_id": doc_id,
@@ -197,6 +212,7 @@ class PDFRAGSystem:
197
  }
198
 
199
  except Exception as e:
 
200
  return {"success": False, "error": str(e)}
201
 
202
  def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
@@ -205,54 +221,75 @@ class PDFRAGSystem:
205
 
206
  print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
207
 
208
- if self.embedder and self.embeddings_store:
209
- # Embedding-based search
210
- query_embedding = self.embedder.encode([query])[0]
 
 
 
 
 
211
 
212
- for doc_id in doc_ids:
213
- if doc_id in self.embeddings_store and doc_id in self.document_chunks:
 
 
214
  doc_embeddings = self.embeddings_store[doc_id]
215
- chunks = self.document_chunks[doc_id]
216
 
217
- # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
218
  similarities = []
219
- for emb in doc_embeddings:
220
- sim = np.dot(query_embedding, emb) / (np.linalg.norm(query_embedding) * np.linalg.norm(emb))
221
- similarities.append(sim)
222
-
223
- # ์ƒ์œ„ ์ฒญํฌ ์„ ํƒ - ์ž„๊ณ„๊ฐ’ ๋‚ฎ์ถค
224
- top_indices = np.argsort(similarities)[-top_k:][::-1]
 
 
 
 
 
 
 
225
 
226
- for idx in top_indices:
227
- if similarities[idx] > 0.1: # Threshold lowered from 0.2 to 0.1
228
- all_relevant_chunks.append({
229
- "content": chunks[idx],
230
- "doc_name": self.documents[doc_id]["metadata"]["file_name"],
231
- "similarity": similarities[idx]
232
- })
233
- print(f"Found chunk with similarity: {similarities[idx]:.3f}")
234
- else:
235
- # Keyword-based search
236
- print("Using keyword-based search (embedder not available)")
237
- query_keywords = set(query.lower().split())
 
 
 
238
 
239
- for doc_id in doc_ids:
240
- if doc_id in self.document_chunks:
241
- chunks = self.document_chunks[doc_id]
242
- for i, chunk in enumerate(chunks): # Search all chunks
243
- chunk_lower = chunk.lower()
244
- score = sum(1 for keyword in query_keywords if keyword in chunk_lower)
245
- if score > 0:
246
- all_relevant_chunks.append({
247
- "content": chunk[:800], # ๋” ๊ธด ์ฒญํฌ ์‚ฌ์šฉ
248
- "doc_name": self.documents[doc_id]["metadata"]["file_name"],
249
- "similarity": score / len(query_keywords) if query_keywords else 0
250
- })
251
 
252
- # Sort and return
253
  all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
 
 
254
  result = all_relevant_chunks[:top_k]
255
  print(f"Returning {len(result)} chunks")
 
 
 
 
 
256
  return result
257
 
258
  def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
@@ -262,10 +299,23 @@ class PDFRAGSystem:
262
  relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
263
 
264
  if not relevant_chunks:
265
- print("No relevant chunks found")
266
- return query, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
- print(f"Found {len(relevant_chunks)} relevant chunks")
269
 
270
  # ์ปจํ…์ŠคํŠธ ๊ตฌ์„ฑ
271
  context_parts = []
@@ -274,15 +324,18 @@ class PDFRAGSystem:
274
 
275
  for i, chunk in enumerate(relevant_chunks, 1):
276
  context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
277
- content = chunk['content'][:500] if len(chunk['content']) > 500 else chunk['content']
 
278
  context_parts.append(content)
279
- print(f"Added chunk {i} with similarity: {chunk.get('similarity', 0):.3f}")
280
 
281
  context_parts.append("\n" + "=" * 40)
282
 
283
  context = "\n".join(context_parts)
284
  enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
285
 
 
 
286
  return enhanced_query, context
287
 
288
  # Initialize model and RAG system
 
173
  # Split the text into chunks
174
  chunks = self.text_splitter.split_text(pdf_data["full_text"])
175
 
176
+ if not chunks:
177
+ print("Warning: No chunks created from PDF")
178
+ return {"success": False, "error": "No text content found in PDF"}
179
+
180
+ print(f"Created {len(chunks)} chunks from PDF")
181
+
182
  # ์ฒญํฌ ์ €์žฅ
183
  self.document_chunks[doc_id] = chunks
184
 
185
+ # ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ (์„ ํƒ์ )
186
  if self.embedder:
187
+ try:
188
+ print("Generating embeddings...")
189
+ embeddings = self.embedder.encode(chunks)
190
+ self.embeddings_store[doc_id] = embeddings
191
+ print(f"Generated {len(embeddings)} embeddings")
192
+ except Exception as e:
193
+ print(f"Warning: Failed to generate embeddings: {e}")
194
+ # Continue even if embedding generation fails
195
 
196
  # Store document info
197
  self.documents[doc_id] = {
 
200
  "upload_time": datetime.now().isoformat()
201
  }
202
 
203
+ # Debug: print the first chunk
204
+ print(f"First chunk preview: {chunks[0][:200]}...")
205
+
206
  return {
207
  "success": True,
208
  "doc_id": doc_id,
 
212
  }
213
 
214
  except Exception as e:
215
+ print(f"Error processing PDF: {e}")
216
  return {"success": False, "error": str(e)}
217
 
218
  def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
 
221
 
222
  print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
223
 
224
+ # First, check that the documents exist
225
+ for doc_id in doc_ids:
226
+ if doc_id not in self.document_chunks:
227
+ print(f"Warning: Document {doc_id} not found in chunks")
228
+ continue
229
+
230
+ chunks = self.document_chunks[doc_id]
231
+ print(f"Document {doc_id} has {len(chunks)} chunks")
232
 
233
+ # Try embedding-based search
234
+ if self.embedder and doc_id in self.embeddings_store:
235
+ try:
236
+ query_embedding = self.embedder.encode([query])[0]
237
  doc_embeddings = self.embeddings_store[doc_id]
 
238
 
239
+ # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ (์•ˆ์ „ํ•˜๊ฒŒ)
240
  similarities = []
241
+ for i, emb in enumerate(doc_embeddings):
242
+ try:
243
+ query_norm = np.linalg.norm(query_embedding)
244
+ emb_norm = np.linalg.norm(emb)
245
+
246
+ if query_norm > 0 and emb_norm > 0:
247
+ sim = np.dot(query_embedding, emb) / (query_norm * emb_norm)
248
+ similarities.append(sim)
249
+ else:
250
+ similarities.append(0.0)
251
+ except Exception as e:
252
+ print(f"Error calculating similarity for chunk {i}: {e}")
253
+ similarities.append(0.0)
254
 
255
+ # ์ƒ์œ„ ์ฒญํฌ ์„ ํƒ
256
+ if similarities:
257
+ top_indices = np.argsort(similarities)[-min(top_k, len(similarities)):][::-1]
258
+
259
+ for idx in top_indices:
260
+ if idx < len(chunks): # Check index range
261
+ all_relevant_chunks.append({
262
+ "content": chunks[idx],
263
+ "doc_name": self.documents[doc_id]["metadata"]["file_name"],
264
+ "similarity": similarities[idx]
265
+ })
266
+ print(f"Added chunk {idx} with similarity: {similarities[idx]:.3f}")
267
+ except Exception as e:
268
+ print(f"Error in embedding search: {e}")
269
+ # Fall back if embedding search fails
270
 
271
+ # ์ž„๋ฒ ๋”ฉ์ด ์—†๊ฑฐ๋‚˜ ์‹คํŒจํ•œ ๊ฒฝ์šฐ - ๊ฐ„๋‹จํžˆ ์ฒ˜์Œ N๊ฐœ ์ฒญํฌ ๋ฐ˜ํ™˜
272
+ if not all_relevant_chunks:
273
+ print(f"Falling back to simple chunk selection for {doc_id}")
274
+ for i in range(min(top_k, len(chunks))):
275
+ all_relevant_chunks.append({
276
+ "content": chunks[i],
277
+ "doc_name": self.documents[doc_id]["metadata"]["file_name"],
278
+ "similarity": 1.0 - (i * 0.1) # ์ˆœ์„œ๋Œ€๋กœ ๊ฐ€์ค‘์น˜
279
+ })
280
+ print(f"Added chunk {i} (fallback)")
 
 
281
 
282
+ # ์œ ์‚ฌ๋„ ๊ธฐ์ค€ ์ •๋ ฌ
283
  all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
284
+
285
+ # ์ƒ์œ„ K๊ฐœ ์„ ํƒ
286
  result = all_relevant_chunks[:top_k]
287
  print(f"Returning {len(result)} chunks")
288
+
289
+ # Debug: print part of the first chunk's content
290
+ if result:
291
+ print(f"First chunk preview: {result[0]['content'][:100]}...")
292
+
293
  return result
294
 
295
  def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
 
299
  relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
300
 
301
  if not relevant_chunks:
302
+ print("No relevant chunks found - checking if documents exist")
303
+ # Documents exist but no chunks were found - use the first chunk as a fallback
304
+ for doc_id in doc_ids:
305
+ if doc_id in self.document_chunks and self.document_chunks[doc_id]:
306
+ print(f"Using first chunk from {doc_id} as fallback")
307
+ relevant_chunks = [{
308
+ "content": self.document_chunks[doc_id][0],
309
+ "doc_name": self.documents[doc_id]["metadata"]["file_name"],
310
+ "similarity": 0.5
311
+ }]
312
+ break
313
+
314
+ if not relevant_chunks:
315
+ print("No documents or chunks available")
316
+ return query, ""
317
 
318
+ print(f"Using {len(relevant_chunks)} chunks for context")
319
 
320
  # ์ปจํ…์ŠคํŠธ ๊ตฌ์„ฑ
321
  context_parts = []
 
324
 
325
  for i, chunk in enumerate(relevant_chunks, 1):
326
  context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
327
+ # ์ฒญํฌ ํฌ๊ธฐ ์ฆ๊ฐ€
328
+ content = chunk['content'][:1000] if len(chunk['content']) > 1000 else chunk['content']
329
  context_parts.append(content)
330
+ print(f"Added chunk {i} ({len(content)} chars) with similarity: {chunk.get('similarity', 0):.3f}")
331
 
332
  context_parts.append("\n" + "=" * 40)
333
 
334
  context = "\n".join(context_parts)
335
  enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
336
 
337
+ print(f"Enhanced query length: {len(enhanced_query)} chars (original: {len(query)} chars)")
338
+
339
  return enhanced_query, context
340
 
341
  # Initialize model and RAG system