chrysanthemum-boy FannC committed on
Commit
ee82924
·
1 Parent(s): 3cefaa0

Add `.doc` file parser. (#497)

Browse files

### What problem does this PR solve?
Add a `.doc` file parser that extracts text with Apache Tika (via the `tika` Python package).
```
pip install tika
```
```
from tika import parser
from io import BytesIO

def extract_text_from_doc_bytes(doc_bytes):
    """Extract the plain text of a `.doc` file using Tika.

    Args:
        doc_bytes: Raw bytes of the document.

    Returns:
        The text Tika extracted, or an empty string when the parse result
        carries no content.
    """
    # Wrap the raw bytes in a file-like object before handing them to Tika.
    file_like_object = BytesIO(doc_bytes)
    parsed = parser.from_buffer(file_like_object)
    # The "content" entry may be missing or None when nothing could be
    # extracted; normalise to "" so callers can safely split the result.
    return parsed.get("content") or ""
```
### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: chrysanthemum-boy <[email protected]>

api/utils/file_utils.py CHANGED
@@ -147,7 +147,7 @@ def filename_type(filename):
147
  return FileType.PDF.value
148
 
149
  if re.match(
150
- r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
151
  return FileType.DOC.value
152
 
153
  if re.match(
 
147
  return FileType.PDF.value
148
 
149
  if re.match(
150
+ r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
151
  return FileType.DOC.value
152
 
153
  if re.match(
rag/app/book.py CHANGED
@@ -11,6 +11,7 @@
11
  # limitations under the License.
12
  #
13
  import copy
 
14
  import re
15
  from io import BytesIO
16
 
@@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
103
  random_choices([t for t, _ in sections], k=200)))
104
  callback(0.8, "Finish parsing.")
105
 
 
 
 
 
 
 
 
 
 
 
106
  else:
107
  raise NotImplementedError(
108
- "file type not supported yet(docx, pdf, txt supported)")
109
 
110
  make_colon_as_title(sections)
111
  bull = bullets_category(
 
11
  # limitations under the License.
12
  #
13
  import copy
14
+ from tika import parser
15
  import re
16
  from io import BytesIO
17
 
 
104
  random_choices([t for t, _ in sections], k=200)))
105
  callback(0.8, "Finish parsing.")
106
 
107
+ elif re.search(r"\.doc$", filename, re.IGNORECASE):
108
+ callback(0.1, "Start to parse.")
109
+ binary = BytesIO(binary)
110
+ doc_parsed = parser.from_buffer(binary)
111
+ sections = doc_parsed['content'].split('\n')
112
+ sections = [(l, "") for l in sections if l]
113
+ remove_contents_table(sections, eng=is_english(
114
+ random_choices([t for t, _ in sections], k=200)))
115
+ callback(0.8, "Finish parsing.")
116
+
117
  else:
118
  raise NotImplementedError(
119
+ "file type not supported yet(doc, docx, pdf, txt supported)")
120
 
121
  make_colon_as_title(sections)
122
  bull = bullets_category(
rag/app/laws.py CHANGED
@@ -11,6 +11,7 @@
11
  # limitations under the License.
12
  #
13
  import copy
 
14
  import re
15
  from io import BytesIO
16
  from docx import Document
@@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
123
  sections = txt.split("\n")
124
  sections = [l for l in sections if l]
125
  callback(0.8, "Finish parsing.")
 
 
 
 
 
 
 
 
 
126
  else:
127
  raise NotImplementedError(
128
- "file type not supported yet(docx, pdf, txt supported)")
129
 
130
  # is it English
131
  eng = lang.lower() == "english" # is_english(sections)
 
11
  # limitations under the License.
12
  #
13
  import copy
14
+ from tika import parser
15
  import re
16
  from io import BytesIO
17
  from docx import Document
 
124
  sections = txt.split("\n")
125
  sections = [l for l in sections if l]
126
  callback(0.8, "Finish parsing.")
127
+
128
+ elif re.search(r"\.doc$", filename, re.IGNORECASE):
129
+ callback(0.1, "Start to parse.")
130
+ binary = BytesIO(binary)
131
+ doc_parsed = parser.from_buffer(binary)
132
+ sections = doc_parsed['content'].split('\n')
133
+ sections = [l for l in sections if l]
134
+ callback(0.8, "Finish parsing.")
135
+
136
  else:
137
  raise NotImplementedError(
138
+ "file type not supported yet(doc, docx, pdf, txt supported)")
139
 
140
  # is it English
141
  eng = lang.lower() == "english" # is_english(sections)
rag/app/naive.py CHANGED
@@ -10,6 +10,7 @@
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
 
13
  from io import BytesIO
14
  from docx import Document
15
  import re
@@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
154
  sections = [(l, "") for l in sections if l]
155
  callback(0.8, "Finish parsing.")
156
 
 
 
 
 
 
 
 
 
157
  else:
158
  raise NotImplementedError(
159
- "file type not supported yet(docx, pdf, txt supported)")
160
 
161
  chunks = naive_merge(
162
  sections, parser_config.get(
 
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
13
+ from tika import parser
14
  from io import BytesIO
15
  from docx import Document
16
  import re
 
155
  sections = [(l, "") for l in sections if l]
156
  callback(0.8, "Finish parsing.")
157
 
158
+ elif re.search(r"\.doc$", filename, re.IGNORECASE):
159
+ callback(0.1, "Start to parse.")
160
+ binary = BytesIO(binary)
161
+ doc_parsed = parser.from_buffer(binary)
162
+ sections = doc_parsed['content'].split('\n')
163
+ sections = [(l, "") for l in sections if l]
164
+ callback(0.8, "Finish parsing.")
165
+
166
  else:
167
  raise NotImplementedError(
168
+ "file type not supported yet(doc, docx, pdf, txt supported)")
169
 
170
  chunks = naive_merge(
171
  sections, parser_config.get(
rag/app/one.py CHANGED
@@ -10,6 +10,8 @@
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
 
 
13
  import re
14
  from rag.app import laws
15
  from rag.nlp import huqie, tokenize, find_codec
@@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
95
  sections = [s for s in sections if s]
96
  callback(0.8, "Finish parsing.")
97
 
 
 
 
 
 
 
 
 
98
  else:
99
  raise NotImplementedError(
100
- "file type not supported yet(docx, pdf, txt supported)")
101
 
102
  doc = {
103
  "docnm_kwd": filename,
 
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
13
+ from tika import parser
14
+ from io import BytesIO
15
  import re
16
  from rag.app import laws
17
  from rag.nlp import huqie, tokenize, find_codec
 
97
  sections = [s for s in sections if s]
98
  callback(0.8, "Finish parsing.")
99
 
100
+ elif re.search(r"\.doc$", filename, re.IGNORECASE):
101
+ callback(0.1, "Start to parse.")
102
+ binary = BytesIO(binary)
103
+ doc_parsed = parser.from_buffer(binary)
104
+ sections = doc_parsed['content'].split('\n')
105
+ sections = [l for l in sections if l]
106
+ callback(0.8, "Finish parsing.")
107
+
108
  else:
109
  raise NotImplementedError(
110
+ "file type not supported yet(doc, docx, pdf, txt supported)")
111
 
112
  doc = {
113
  "docnm_kwd": filename,
requirements.txt CHANGED
@@ -116,6 +116,7 @@ sniffio==1.3.1
116
  StrEnum==0.4.15
117
  sympy==1.12
118
  threadpoolctl==3.3.0
 
119
  tiktoken==0.6.0
120
  tokenizers==0.15.2
121
  torch==2.2.1
@@ -133,4 +134,4 @@ xxhash==3.4.1
133
  yarl==1.9.4
134
  zhipuai==2.0.1
135
  BCEmbedding
136
- loguru==0.7.2
 
116
  StrEnum==0.4.15
117
  sympy==1.12
118
  threadpoolctl==3.3.0
119
+ tika==2.6.0
120
  tiktoken==0.6.0
121
  tokenizers==0.15.2
122
  torch==2.2.1
 
134
  yarl==1.9.4
135
  zhipuai==2.0.1
136
  BCEmbedding
137
+ loguru==0.7.2