Add `.doc` file parser. (#497)
Browse files
### What problem does this PR solve?
Add a parser for legacy `.doc` (Word 97-2003) files, using Apache Tika through the `tika` Python package.
```
pip install tika
```
```
from tika import parser
from io import BytesIO
def extract_text_from_doc_bytes(doc_bytes):
    """Extract the plain text of a legacy ``.doc`` file held in memory.

    Args:
        doc_bytes: Raw bytes of the ``.doc`` file.

    Returns:
        The text Tika extracted, or an empty string when Tika reports no
        content for the document.
    """
    # Wrap the raw bytes in a file-like object before handing them to Tika.
    file_like_object = BytesIO(doc_bytes)
    parsed = parser.from_buffer(file_like_object)
    # The "content" entry may be missing or None when nothing could be
    # extracted; normalize to "" so callers can safely call str methods
    # such as .split("\n") on the result.
    return parsed.get("content") or ""
```
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: chrysanthemum-boy <[email protected]>
- api/utils/file_utils.py +1 -1
- rag/app/book.py +12 -1
- rag/app/laws.py +11 -1
- rag/app/naive.py +10 -1
- rag/app/one.py +11 -1
- requirements.txt +2 -1
api/utils/file_utils.py
CHANGED
@@ -147,7 +147,7 @@ def filename_type(filename):
|
|
147 |
return FileType.PDF.value
|
148 |
|
149 |
if re.match(
|
150 |
-
r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
151 |
return FileType.DOC.value
|
152 |
|
153 |
if re.match(
|
|
|
147 |
return FileType.PDF.value
|
148 |
|
149 |
if re.match(
|
150 |
+
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
151 |
return FileType.DOC.value
|
152 |
|
153 |
if re.match(
|
rag/app/book.py
CHANGED
@@ -11,6 +11,7 @@
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import copy
|
|
|
14 |
import re
|
15 |
from io import BytesIO
|
16 |
|
@@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
103 |
random_choices([t for t, _ in sections], k=200)))
|
104 |
callback(0.8, "Finish parsing.")
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
else:
|
107 |
raise NotImplementedError(
|
108 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
109 |
|
110 |
make_colon_as_title(sections)
|
111 |
bull = bullets_category(
|
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import copy
|
14 |
+
from tika import parser
|
15 |
import re
|
16 |
from io import BytesIO
|
17 |
|
|
|
104 |
random_choices([t for t, _ in sections], k=200)))
|
105 |
callback(0.8, "Finish parsing.")
|
106 |
|
107 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
108 |
+
callback(0.1, "Start to parse.")
|
109 |
+
binary = BytesIO(binary)
|
110 |
+
doc_parsed = parser.from_buffer(binary)
|
111 |
+
sections = doc_parsed['content'].split('\n')
|
112 |
+
sections = [(l, "") for l in sections if l]
|
113 |
+
remove_contents_table(sections, eng=is_english(
|
114 |
+
random_choices([t for t, _ in sections], k=200)))
|
115 |
+
callback(0.8, "Finish parsing.")
|
116 |
+
|
117 |
else:
|
118 |
raise NotImplementedError(
|
119 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
120 |
|
121 |
make_colon_as_title(sections)
|
122 |
bull = bullets_category(
|
rag/app/laws.py
CHANGED
@@ -11,6 +11,7 @@
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import copy
|
|
|
14 |
import re
|
15 |
from io import BytesIO
|
16 |
from docx import Document
|
@@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
123 |
sections = txt.split("\n")
|
124 |
sections = [l for l in sections if l]
|
125 |
callback(0.8, "Finish parsing.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
else:
|
127 |
raise NotImplementedError(
|
128 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
129 |
|
130 |
# is it English
|
131 |
eng = lang.lower() == "english" # is_english(sections)
|
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import copy
|
14 |
+
from tika import parser
|
15 |
import re
|
16 |
from io import BytesIO
|
17 |
from docx import Document
|
|
|
124 |
sections = txt.split("\n")
|
125 |
sections = [l for l in sections if l]
|
126 |
callback(0.8, "Finish parsing.")
|
127 |
+
|
128 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
129 |
+
callback(0.1, "Start to parse.")
|
130 |
+
binary = BytesIO(binary)
|
131 |
+
doc_parsed = parser.from_buffer(binary)
|
132 |
+
sections = doc_parsed['content'].split('\n')
|
133 |
+
sections = [l for l in sections if l]
|
134 |
+
callback(0.8, "Finish parsing.")
|
135 |
+
|
136 |
else:
|
137 |
raise NotImplementedError(
|
138 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
139 |
|
140 |
# is it English
|
141 |
eng = lang.lower() == "english" # is_english(sections)
|
rag/app/naive.py
CHANGED
@@ -10,6 +10,7 @@
|
|
10 |
# See the License for the specific language governing permissions and
|
11 |
# limitations under the License.
|
12 |
#
|
|
|
13 |
from io import BytesIO
|
14 |
from docx import Document
|
15 |
import re
|
@@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
154 |
sections = [(l, "") for l in sections if l]
|
155 |
callback(0.8, "Finish parsing.")
|
156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
else:
|
158 |
raise NotImplementedError(
|
159 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
160 |
|
161 |
chunks = naive_merge(
|
162 |
sections, parser_config.get(
|
|
|
10 |
# See the License for the specific language governing permissions and
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
+
from tika import parser
|
14 |
from io import BytesIO
|
15 |
from docx import Document
|
16 |
import re
|
|
|
155 |
sections = [(l, "") for l in sections if l]
|
156 |
callback(0.8, "Finish parsing.")
|
157 |
|
158 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
159 |
+
callback(0.1, "Start to parse.")
|
160 |
+
binary = BytesIO(binary)
|
161 |
+
doc_parsed = parser.from_buffer(binary)
|
162 |
+
sections = doc_parsed['content'].split('\n')
|
163 |
+
sections = [(l, "") for l in sections if l]
|
164 |
+
callback(0.8, "Finish parsing.")
|
165 |
+
|
166 |
else:
|
167 |
raise NotImplementedError(
|
168 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
169 |
|
170 |
chunks = naive_merge(
|
171 |
sections, parser_config.get(
|
rag/app/one.py
CHANGED
@@ -10,6 +10,8 @@
|
|
10 |
# See the License for the specific language governing permissions and
|
11 |
# limitations under the License.
|
12 |
#
|
|
|
|
|
13 |
import re
|
14 |
from rag.app import laws
|
15 |
from rag.nlp import huqie, tokenize, find_codec
|
@@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
95 |
sections = [s for s in sections if s]
|
96 |
callback(0.8, "Finish parsing.")
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
else:
|
99 |
raise NotImplementedError(
|
100 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
101 |
|
102 |
doc = {
|
103 |
"docnm_kwd": filename,
|
|
|
10 |
# See the License for the specific language governing permissions and
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
+
from tika import parser
|
14 |
+
from io import BytesIO
|
15 |
import re
|
16 |
from rag.app import laws
|
17 |
from rag.nlp import huqie, tokenize, find_codec
|
|
|
97 |
sections = [s for s in sections if s]
|
98 |
callback(0.8, "Finish parsing.")
|
99 |
|
100 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
101 |
+
callback(0.1, "Start to parse.")
|
102 |
+
binary = BytesIO(binary)
|
103 |
+
doc_parsed = parser.from_buffer(binary)
|
104 |
+
sections = doc_parsed['content'].split('\n')
|
105 |
+
sections = [l for l in sections if l]
|
106 |
+
callback(0.8, "Finish parsing.")
|
107 |
+
|
108 |
else:
|
109 |
raise NotImplementedError(
|
110 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
111 |
|
112 |
doc = {
|
113 |
"docnm_kwd": filename,
|
requirements.txt
CHANGED
@@ -116,6 +116,7 @@ sniffio==1.3.1
|
|
116 |
StrEnum==0.4.15
|
117 |
sympy==1.12
|
118 |
threadpoolctl==3.3.0
|
|
|
119 |
tiktoken==0.6.0
|
120 |
tokenizers==0.15.2
|
121 |
torch==2.2.1
|
@@ -133,4 +134,4 @@ xxhash==3.4.1
|
|
133 |
yarl==1.9.4
|
134 |
zhipuai==2.0.1
|
135 |
BCEmbedding
|
136 |
-
loguru==0.7.2
|
|
|
116 |
StrEnum==0.4.15
|
117 |
sympy==1.12
|
118 |
threadpoolctl==3.3.0
|
119 |
+
tika==2.6.0
|
120 |
tiktoken==0.6.0
|
121 |
tokenizers==0.15.2
|
122 |
torch==2.2.1
|
|
|
134 |
yarl==1.9.4
|
135 |
zhipuai==2.0.1
|
136 |
BCEmbedding
|
137 |
+
loguru==0.7.2
|