frascuchon (HF Staff) committed
Commit 27030ce · Parent: 7a52116

fix: Avoid chunker errors the first time

Files changed (1)
  chunker.py  +22 -16
chunker.py CHANGED
@@ -1,8 +1,10 @@
-import requests
-from bs4 import BeautifulSoup
+import threading
+
 from llama_index.core import Document
 from llama_index.core.node_parser import SentenceSplitter
 
+lock = threading.Lock()
+
 
 def llama_index_sentence_splitter(
     documents: list[str],
@@ -31,22 +33,26 @@ def llama_index_sentence_splitter(
         return ' '.join(text.split())
 
     chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))
-    node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
+
+    # See here for more details on why we use a lock:
+    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
+    with lock:
+        node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
 
-    chunks = []
-    docs = [[Document(text=doc)] for doc in documents]
-    for doc_id, doc in zip(document_ids, docs):
+    chunks = []
+    docs = [[Document(text=doc)] for doc in documents]
+    for doc_id, doc in zip(document_ids, docs):
 
-        texts = [node.text for node in node_parser(doc)]
+        texts = [node.text for node in node_parser(doc)]
 
-        # Filter out chunks that do not contain enough text
-        texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
+        # Filter out chunks that do not contain enough text
+        texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
 
-        if not texts:
-            continue
+        if not texts:
+            continue
 
-        chunks += [
-            {"document_id": doc_id, "content": text}
-            for text in texts
-        ]
-    return chunks
+        chunks += [
+            {"document_id": doc_id, "content": text}
+            for text in texts
+        ]
+    return chunks
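
For context on the failure this commit works around: the first SentenceSplitter constructed in a process triggers NLTK's lazy loading of tokenizer resources, and doing that from several threads at once can hit the LazyCorpusLoader AttributeError described in the Stack Overflow thread cited in the new comment. Serializing construction behind the module-level lock makes that first call safe. A minimal sketch of the concurrent scenario follows; it assumes chunker.py is importable as chunker, infers the document_ids parameter from the function body above, and assumes chunk_size and skip_chunks_threshold have defaults, so treat it as illustrative rather than this repo's exact API.

# Illustrative sketch (assumed API): several threads call the patched
# splitter at the same time, which is the "first time" scenario that
# previously raced NLTK's lazy corpus loading and could crash.
import threading

from chunker import llama_index_sentence_splitter  # this commit's module

def worker(i: int) -> None:
    # document_ids is inferred from zip(document_ids, docs) in the diff;
    # the sample text and ids here are arbitrary.
    chunks = llama_index_sentence_splitter(
        documents=["First sentence. Second sentence. Third sentence."],
        document_ids=[f"doc-{i}"],
    )
    print(f"thread {i}: {len(chunks)} chunk(s)")

threads = [threading.Thread(target=worker, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

Note that only the construction of SentenceSplitter is locked; parsing the documents still runs concurrently, so the lock costs nothing once NLTK's resources are loaded.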