frascuchon (HF Staff) committed
Commit 27030ce · Parent: 7a52116

fix: Avoid chunker errors the first time

Files changed (1)
  chunker.py  +22 -16
chunker.py CHANGED
@@ -1,8 +1,10 @@
-import requests
-from bs4 import BeautifulSoup
+import threading
+
 from llama_index.core import Document
 from llama_index.core.node_parser import SentenceSplitter
 
+lock = threading.Lock()
+
 
 def llama_index_sentence_splitter(
     documents: list[str],
@@ -31,22 +33,26 @@ def llama_index_sentence_splitter(
         return ' '.join(text.split())
 
     chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))
-    node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
+
+    # See here for more details on why we use a lock:
+    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
+    with lock:
+        node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))
 
-    chunks = []
-    docs = [[Document(text=doc)] for doc in documents]
-    for doc_id, doc in zip(document_ids, docs):
+    chunks = []
+    docs = [[Document(text=doc)] for doc in documents]
+    for doc_id, doc in zip(document_ids, docs):
 
-        texts = [node.text for node in node_parser(doc)]
+        texts = [node.text for node in node_parser(doc)]
 
-        # Filter out chunks that do not contain enough text
-        texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
+        # Filter out chunks that do not contain enough text
+        texts = [text for text in texts if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold]
 
-        if not texts:
-            continue
+        if not texts:
+            continue
 
-        chunks += [
-            {"document_id": doc_id, "content": text}
-            for text in texts
-        ]
-    return chunks
+        chunks += [
+            {"document_id": doc_id, "content": text}
+            for text in texts
+        ]
+    return chunks
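
For context on the failure this commit works around: the first SentenceSplitter constructed in a process triggers NLTK's lazy loading of tokenizer resources, and doing that from several threads at once can hit the LazyCorpusLoader AttributeError described in the Stack Overflow thread cited in the new comment. Serializing construction behind the module-level lock makes that first call safe. A minimal sketch of the concurrent scenario follows; it assumes chunker.py is importable as chunker, infers the document_ids parameter from the function body above, and assumes chunk_size and skip_chunks_threshold have defaults, so treat it as illustrative rather than this repo's exact API.

# Illustrative sketch (assumed API): several threads call the patched
# splitter at the same time, which is the "first time" scenario that
# previously raced NLTK's lazy corpus loading and could crash.
import threading

from chunker import llama_index_sentence_splitter  # this commit's module

def worker(i: int) -> None:
    # document_ids is inferred from zip(document_ids, docs) in the diff;
    # the sample text and ids here are arbitrary.
    chunks = llama_index_sentence_splitter(
        documents=["First sentence. Second sentence. Third sentence."],
        document_ids=[f"doc-{i}"],
    )
    print(f"thread {i}: {len(chunks)} chunk(s)")

threads = [threading.Thread(target=worker, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

Note that only the construction of SentenceSplitter is locked; parsing the documents still runs concurrently, so the lock costs nothing once NLTK's resources are loaded.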