Add `.doc` file parser. (#497)
### What problem does this PR solve?
Add `.doc` file parser, using tika.
```
pip install tika
```
```
from tika import parser
from io import BytesIO
def extract_text_from_doc_bytes(doc_bytes):
    """Extract plain text from the raw bytes of a legacy ``.doc`` file.

    Wraps the bytes in an in-memory buffer and hands it to Apache Tika,
    which shells out to the Tika server/JAR to parse the document.

    Args:
        doc_bytes: Raw binary content of a ``.doc`` file.

    Returns:
        The extracted text, i.e. the ``"content"`` field of Tika's parse
        result. NOTE(review): Tika returns ``None`` for ``content`` when
        nothing could be extracted — callers should handle a ``None``
        return. TODO confirm desired behavior for empty documents.
    """
    file_like_object = BytesIO(doc_bytes)
    parsed = parser.from_buffer(file_like_object)
    return parsed["content"]
```
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: chrysanthemum-boy <[email protected]>
- api/utils/file_utils.py +1 -1
- rag/app/book.py +12 -1
- rag/app/laws.py +11 -1
- rag/app/naive.py +10 -1
- rag/app/one.py +11 -1
- requirements.txt +2 -1
api/utils/file_utils.py
CHANGED
|
@@ -147,7 +147,7 @@ def filename_type(filename):
|
|
| 147 |
return FileType.PDF.value
|
| 148 |
|
| 149 |
if re.match(
|
| 150 |
-
r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
| 151 |
return FileType.DOC.value
|
| 152 |
|
| 153 |
if re.match(
|
|
|
|
| 147 |
return FileType.PDF.value
|
| 148 |
|
| 149 |
if re.match(
|
| 150 |
+
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
| 151 |
return FileType.DOC.value
|
| 152 |
|
| 153 |
if re.match(
|
rag/app/book.py
CHANGED
|
@@ -11,6 +11,7 @@
|
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
import copy
|
|
|
|
| 14 |
import re
|
| 15 |
from io import BytesIO
|
| 16 |
|
|
@@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 103 |
random_choices([t for t, _ in sections], k=200)))
|
| 104 |
callback(0.8, "Finish parsing.")
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
else:
|
| 107 |
raise NotImplementedError(
|
| 108 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
| 109 |
|
| 110 |
make_colon_as_title(sections)
|
| 111 |
bull = bullets_category(
|
|
|
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
import copy
|
| 14 |
+
from tika import parser
|
| 15 |
import re
|
| 16 |
from io import BytesIO
|
| 17 |
|
|
|
|
| 104 |
random_choices([t for t, _ in sections], k=200)))
|
| 105 |
callback(0.8, "Finish parsing.")
|
| 106 |
|
| 107 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
| 108 |
+
callback(0.1, "Start to parse.")
|
| 109 |
+
binary = BytesIO(binary)
|
| 110 |
+
doc_parsed = parser.from_buffer(binary)
|
| 111 |
+
sections = doc_parsed['content'].split('\n')
|
| 112 |
+
sections = [(l, "") for l in sections if l]
|
| 113 |
+
remove_contents_table(sections, eng=is_english(
|
| 114 |
+
random_choices([t for t, _ in sections], k=200)))
|
| 115 |
+
callback(0.8, "Finish parsing.")
|
| 116 |
+
|
| 117 |
else:
|
| 118 |
raise NotImplementedError(
|
| 119 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
| 120 |
|
| 121 |
make_colon_as_title(sections)
|
| 122 |
bull = bullets_category(
|
rag/app/laws.py
CHANGED
|
@@ -11,6 +11,7 @@
|
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
import copy
|
|
|
|
| 14 |
import re
|
| 15 |
from io import BytesIO
|
| 16 |
from docx import Document
|
|
@@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 123 |
sections = txt.split("\n")
|
| 124 |
sections = [l for l in sections if l]
|
| 125 |
callback(0.8, "Finish parsing.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
else:
|
| 127 |
raise NotImplementedError(
|
| 128 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
| 129 |
|
| 130 |
# is it English
|
| 131 |
eng = lang.lower() == "english" # is_english(sections)
|
|
|
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
import copy
|
| 14 |
+
from tika import parser
|
| 15 |
import re
|
| 16 |
from io import BytesIO
|
| 17 |
from docx import Document
|
|
|
|
| 124 |
sections = txt.split("\n")
|
| 125 |
sections = [l for l in sections if l]
|
| 126 |
callback(0.8, "Finish parsing.")
|
| 127 |
+
|
| 128 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
| 129 |
+
callback(0.1, "Start to parse.")
|
| 130 |
+
binary = BytesIO(binary)
|
| 131 |
+
doc_parsed = parser.from_buffer(binary)
|
| 132 |
+
sections = doc_parsed['content'].split('\n')
|
| 133 |
+
sections = [l for l in sections if l]
|
| 134 |
+
callback(0.8, "Finish parsing.")
|
| 135 |
+
|
| 136 |
else:
|
| 137 |
raise NotImplementedError(
|
| 138 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
| 139 |
|
| 140 |
# is it English
|
| 141 |
eng = lang.lower() == "english" # is_english(sections)
|
rag/app/naive.py
CHANGED
|
@@ -10,6 +10,7 @@
|
|
| 10 |
# See the License for the specific language governing permissions and
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
|
|
|
| 13 |
from io import BytesIO
|
| 14 |
from docx import Document
|
| 15 |
import re
|
|
@@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 154 |
sections = [(l, "") for l in sections if l]
|
| 155 |
callback(0.8, "Finish parsing.")
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
else:
|
| 158 |
raise NotImplementedError(
|
| 159 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
| 160 |
|
| 161 |
chunks = naive_merge(
|
| 162 |
sections, parser_config.get(
|
|
|
|
| 10 |
# See the License for the specific language governing permissions and
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
+
from tika import parser
|
| 14 |
from io import BytesIO
|
| 15 |
from docx import Document
|
| 16 |
import re
|
|
|
|
| 155 |
sections = [(l, "") for l in sections if l]
|
| 156 |
callback(0.8, "Finish parsing.")
|
| 157 |
|
| 158 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
| 159 |
+
callback(0.1, "Start to parse.")
|
| 160 |
+
binary = BytesIO(binary)
|
| 161 |
+
doc_parsed = parser.from_buffer(binary)
|
| 162 |
+
sections = doc_parsed['content'].split('\n')
|
| 163 |
+
sections = [(l, "") for l in sections if l]
|
| 164 |
+
callback(0.8, "Finish parsing.")
|
| 165 |
+
|
| 166 |
else:
|
| 167 |
raise NotImplementedError(
|
| 168 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
| 169 |
|
| 170 |
chunks = naive_merge(
|
| 171 |
sections, parser_config.get(
|
rag/app/one.py
CHANGED
|
@@ -10,6 +10,8 @@
|
|
| 10 |
# See the License for the specific language governing permissions and
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
|
|
|
|
|
|
| 13 |
import re
|
| 14 |
from rag.app import laws
|
| 15 |
from rag.nlp import huqie, tokenize, find_codec
|
|
@@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 95 |
sections = [s for s in sections if s]
|
| 96 |
callback(0.8, "Finish parsing.")
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
else:
|
| 99 |
raise NotImplementedError(
|
| 100 |
-
"file type not supported yet(docx, pdf, txt supported)")
|
| 101 |
|
| 102 |
doc = {
|
| 103 |
"docnm_kwd": filename,
|
|
|
|
| 10 |
# See the License for the specific language governing permissions and
|
| 11 |
# limitations under the License.
|
| 12 |
#
|
| 13 |
+
from tika import parser
|
| 14 |
+
from io import BytesIO
|
| 15 |
import re
|
| 16 |
from rag.app import laws
|
| 17 |
from rag.nlp import huqie, tokenize, find_codec
|
|
|
|
| 97 |
sections = [s for s in sections if s]
|
| 98 |
callback(0.8, "Finish parsing.")
|
| 99 |
|
| 100 |
+
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
| 101 |
+
callback(0.1, "Start to parse.")
|
| 102 |
+
binary = BytesIO(binary)
|
| 103 |
+
doc_parsed = parser.from_buffer(binary)
|
| 104 |
+
sections = doc_parsed['content'].split('\n')
|
| 105 |
+
sections = [l for l in sections if l]
|
| 106 |
+
callback(0.8, "Finish parsing.")
|
| 107 |
+
|
| 108 |
else:
|
| 109 |
raise NotImplementedError(
|
| 110 |
+
"file type not supported yet(doc, docx, pdf, txt supported)")
|
| 111 |
|
| 112 |
doc = {
|
| 113 |
"docnm_kwd": filename,
|
requirements.txt
CHANGED
|
@@ -116,6 +116,7 @@ sniffio==1.3.1
|
|
| 116 |
StrEnum==0.4.15
|
| 117 |
sympy==1.12
|
| 118 |
threadpoolctl==3.3.0
|
|
|
|
| 119 |
tiktoken==0.6.0
|
| 120 |
tokenizers==0.15.2
|
| 121 |
torch==2.2.1
|
|
@@ -133,4 +134,4 @@ xxhash==3.4.1
|
|
| 133 |
yarl==1.9.4
|
| 134 |
zhipuai==2.0.1
|
| 135 |
BCEmbedding
|
| 136 |
-
loguru==0.7.2
|
|
|
|
| 116 |
StrEnum==0.4.15
|
| 117 |
sympy==1.12
|
| 118 |
threadpoolctl==3.3.0
|
| 119 |
+
tika==2.6.0
|
| 120 |
tiktoken==0.6.0
|
| 121 |
tokenizers==0.15.2
|
| 122 |
torch==2.2.1
|
|
|
|
| 134 |
yarl==1.9.4
|
| 135 |
zhipuai==2.0.1
|
| 136 |
BCEmbedding
|
| 137 |
+
loguru==0.7.2
|