Kevin Hu committed on
Commit
42023af
·
1 Parent(s): d1a0b33

Fix parser selection for pptx files that come from the file manager (#2482)

Browse files

### What problem does this PR solve?

#2467

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/apps/file2document_app.py CHANGED
@@ -77,7 +77,7 @@ def convert():
77
  doc = DocumentService.insert({
78
  "id": get_uuid(),
79
  "kb_id": kb.id,
80
- "parser_id": kb.parser_id,
81
  "parser_config": kb.parser_config,
82
  "created_by": current_user.id,
83
  "type": file.type,
@@ -85,7 +85,6 @@ def convert():
85
  "location": file.location,
86
  "size": file.size
87
  })
88
- FileService.set_constant_parser(doc, file.name)
89
  file2document = File2DocumentService.insert({
90
  "id": get_uuid(),
91
  "file_id": id,
 
77
  doc = DocumentService.insert({
78
  "id": get_uuid(),
79
  "kb_id": kb.id,
80
+ "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
81
  "parser_config": kb.parser_config,
82
  "created_by": current_user.id,
83
  "type": file.type,
 
85
  "location": file.location,
86
  "size": file.size
87
  })
 
88
  file2document = File2DocumentService.insert({
89
  "id": get_uuid(),
90
  "file_id": id,
api/db/services/file_service.py CHANGED
@@ -357,7 +357,7 @@ class FileService(CommonService):
357
  doc = {
358
  "id": get_uuid(),
359
  "kb_id": kb.id,
360
- "parser_id": kb.parser_id,
361
  "parser_config": kb.parser_config,
362
  "created_by": user_id,
363
  "type": filetype,
@@ -366,7 +366,6 @@ class FileService(CommonService):
366
  "size": len(blob),
367
  "thumbnail": thumbnail(filename, blob)
368
  }
369
- self.set_constant_parser(doc, filename)
370
  DocumentService.insert(doc)
371
 
372
  FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
@@ -377,12 +376,13 @@ class FileService(CommonService):
377
  return err, files
378
 
379
  @staticmethod
380
- def set_constant_parser(doc, filename):
381
- if doc["type"] == FileType.VISUAL:
382
- doc["parser_id"] = ParserType.PICTURE.value
383
- if doc["type"] == FileType.AURAL:
384
- doc["parser_id"] = ParserType.AUDIO.value
385
  if re.search(r"\.(ppt|pptx|pages)$", filename):
386
- doc["parser_id"] = ParserType.PRESENTATION.value
387
  if re.search(r"\.(eml)$", filename):
388
- doc["parser_id"] = ParserType.EMAIL.value
 
 
357
  doc = {
358
  "id": get_uuid(),
359
  "kb_id": kb.id,
360
+ "parser_id": self.get_parser(filetype, filename, kb.parser_id),
361
  "parser_config": kb.parser_config,
362
  "created_by": user_id,
363
  "type": filetype,
 
366
  "size": len(blob),
367
  "thumbnail": thumbnail(filename, blob)
368
  }
 
369
  DocumentService.insert(doc)
370
 
371
  FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
 
376
  return err, files
377
 
378
  @staticmethod
379
+ def get_parser(doc_type, filename, default):
380
+ if doc_type == FileType.VISUAL:
381
+ return ParserType.PICTURE.value
382
+ if doc_type == FileType.AURAL:
383
+ return ParserType.AUDIO.value
384
  if re.search(r"\.(ppt|pptx|pages)$", filename):
385
+ return ParserType.PRESENTATION.value
386
  if re.search(r"\.(eml)$", filename):
387
+ return ParserType.EMAIL.value
388
+ return default