dashi6174
committed on
Commit
·
c46aa24
1
Parent(s):
9d66cd6
Support parsing of code files (#789)
Browse files

### What problem does this PR solve?
_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/utils/file_utils.py +1 -1
- rag/app/naive.py +1 -1
api/utils/file_utils.py
CHANGED
@@ -156,7 +156,7 @@ def filename_type(filename):
|
|
156 |
return FileType.PDF.value
|
157 |
|
158 |
if re.match(
|
159 |
-
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
160 |
return FileType.DOC.value
|
161 |
|
162 |
if re.match(
|
|
|
156 |
return FileType.PDF.value
|
157 |
|
158 |
if re.match(
|
159 |
+
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename):
|
160 |
return FileType.DOC.value
|
161 |
|
162 |
if re.match(
|
rag/app/naive.py
CHANGED
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
136 |
excel_parser = ExcelParser()
|
137 |
sections = [(excel_parser.html(binary), "")]
|
138 |
|
139 |
-
elif re.search(r"\.(txt|md)$", filename, re.IGNORECASE):
|
140 |
callback(0.1, "Start to parse.")
|
141 |
txt = ""
|
142 |
if binary:
|
|
|
136 |
excel_parser = ExcelParser()
|
137 |
sections = [(excel_parser.html(binary), "")]
|
138 |
|
139 |
+
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
140 |
callback(0.1, "Start to parse.")
|
141 |
txt = ""
|
142 |
if binary:
|