Kevin Hu
committed on
Commit
·
42023af
1
Parent(s):
d1a0b33
Fix parser selection for pptx files that come from the file manager (#2482)
Browse files
### What problem does this PR solve?
#2467
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
api/apps/file2document_app.py
CHANGED
@@ -77,7 +77,7 @@ def convert():
|
|
77 |
doc = DocumentService.insert({
|
78 |
"id": get_uuid(),
|
79 |
"kb_id": kb.id,
|
80 |
-
"parser_id": kb.parser_id,
|
81 |
"parser_config": kb.parser_config,
|
82 |
"created_by": current_user.id,
|
83 |
"type": file.type,
|
@@ -85,7 +85,6 @@ def convert():
|
|
85 |
"location": file.location,
|
86 |
"size": file.size
|
87 |
})
|
88 |
-
FileService.set_constant_parser(doc, file.name)
|
89 |
file2document = File2DocumentService.insert({
|
90 |
"id": get_uuid(),
|
91 |
"file_id": id,
|
|
|
77 |
doc = DocumentService.insert({
|
78 |
"id": get_uuid(),
|
79 |
"kb_id": kb.id,
|
80 |
+
"parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
|
81 |
"parser_config": kb.parser_config,
|
82 |
"created_by": current_user.id,
|
83 |
"type": file.type,
|
|
|
85 |
"location": file.location,
|
86 |
"size": file.size
|
87 |
})
|
|
|
88 |
file2document = File2DocumentService.insert({
|
89 |
"id": get_uuid(),
|
90 |
"file_id": id,
|
api/db/services/file_service.py
CHANGED
@@ -357,7 +357,7 @@ class FileService(CommonService):
|
|
357 |
doc = {
|
358 |
"id": get_uuid(),
|
359 |
"kb_id": kb.id,
|
360 |
-
"parser_id": kb.parser_id,
|
361 |
"parser_config": kb.parser_config,
|
362 |
"created_by": user_id,
|
363 |
"type": filetype,
|
@@ -366,7 +366,6 @@ class FileService(CommonService):
|
|
366 |
"size": len(blob),
|
367 |
"thumbnail": thumbnail(filename, blob)
|
368 |
}
|
369 |
-
self.set_constant_parser(doc, filename)
|
370 |
DocumentService.insert(doc)
|
371 |
|
372 |
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
@@ -377,12 +376,13 @@ class FileService(CommonService):
|
|
377 |
return err, files
|
378 |
|
379 |
@staticmethod
|
380 |
-
def
|
381 |
-
if
|
382 |
-
|
383 |
-
if
|
384 |
-
|
385 |
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
386 |
-
|
387 |
if re.search(r"\.(eml)$", filename):
|
388 |
-
|
|
|
|
357 |
doc = {
|
358 |
"id": get_uuid(),
|
359 |
"kb_id": kb.id,
|
360 |
+
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
|
361 |
"parser_config": kb.parser_config,
|
362 |
"created_by": user_id,
|
363 |
"type": filetype,
|
|
|
366 |
"size": len(blob),
|
367 |
"thumbnail": thumbnail(filename, blob)
|
368 |
}
|
|
|
369 |
DocumentService.insert(doc)
|
370 |
|
371 |
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
|
|
376 |
return err, files
|
377 |
|
378 |
@staticmethod
|
379 |
+
def get_parser(doc_type, filename, default):
    """Choose the parser id for a document from its type and file name.

    Rules are evaluated in order and the first match wins: type-based
    rules first, then extension-based rules, then the fallback.

    Args:
        doc_type: Document type; compared against ``FileType.VISUAL`` and
            ``FileType.AURAL``.
        filename: File name whose extension is inspected.
        default: Value returned unchanged when no rule matches.

    Returns:
        ``ParserType.PICTURE.value`` for visual documents,
        ``ParserType.AUDIO.value`` for aural documents,
        ``ParserType.PRESENTATION.value`` for ``.ppt``/``.pptx``/``.pages``
        files, ``ParserType.EMAIL.value`` for ``.eml`` files, otherwise
        ``default``.
    """
    if doc_type == FileType.VISUAL:
        return ParserType.PICTURE.value
    if doc_type == FileType.AURAL:
        return ParserType.AUDIO.value
    # Match extensions case-insensitively so that names such as "Deck.PPTX"
    # or "MAIL.EML" are not silently routed to the default parser.
    if re.search(r"\.(ppt|pptx|pages)$", filename, re.IGNORECASE):
        return ParserType.PRESENTATION.value
    if re.search(r"\.(eml)$", filename, re.IGNORECASE):
        return ParserType.EMAIL.value
    return default
|