liuhua liuhua committed on
Commit
78eb735
·
1 Parent(s): 3c2255f

Fix the bug causing garbled text (#3640)

Browse files

### What problem does this PR solve?

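Fix the bug causing garbled text (#3613): `find_codec` now tries `chardet` detection on the first 1024 bytes before falling back to the `all_codecs` list, the comma missing between `'utf_32_le'` and `'utf_16_be'` in that list is restored, and the `windows-125x` and `latin-2` codecs are added.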

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <[email protected]>

api/apps/kb_app.py CHANGED
@@ -162,9 +162,9 @@ def rm():
162
  message="Database error (Document removal)!")
163
  f2d = File2DocumentService.get_by_document_id(doc.id)
164
  FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
165
- FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
166
  File2DocumentService.delete_by_document_id(doc.id)
167
-
 
168
  if not KnowledgebaseService.delete_by_id(req["kb_id"]):
169
  return get_data_error_result(
170
  message="Database error (Knowledgebase removal)!")
 
162
  message="Database error (Document removal)!")
163
  f2d = File2DocumentService.get_by_document_id(doc.id)
164
  FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
 
165
  File2DocumentService.delete_by_document_id(doc.id)
166
+ FileService.filter_delete(
167
+ [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
168
  if not KnowledgebaseService.delete_by_id(req["kb_id"]):
169
  return get_data_error_result(
170
  message="Database error (Knowledgebase removal)!")
api/apps/sdk/dataset.py CHANGED
@@ -252,9 +252,9 @@ def delete(tenant_id):
252
  File.id == f2d[0].file_id,
253
  ]
254
  )
255
- FileService.filter_delete(
256
- [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
257
  File2DocumentService.delete_by_document_id(doc.id)
 
 
258
  if not KnowledgebaseService.delete_by_id(id):
259
  return get_error_data_result(message="Delete dataset error.(Database error)")
260
  return get_result(code=settings.RetCode.SUCCESS)
 
252
  File.id == f2d[0].file_id,
253
  ]
254
  )
 
 
255
  File2DocumentService.delete_by_document_id(doc.id)
256
+ FileService.filter_delete(
257
+ [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
258
  if not KnowledgebaseService.delete_by_id(id):
259
  return get_error_data_result(message="Delete dataset error.(Database error)")
260
  return get_result(code=settings.RetCode.SUCCESS)
rag/nlp/__init__.py CHANGED
@@ -28,6 +28,8 @@ from cn2an import cn2an
28
  from PIL import Image
29
  import json
30
 
 
 
31
  all_codecs = [
32
  'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
33
  'cp037', 'cp273', 'cp424', 'cp437',
@@ -43,12 +45,17 @@ all_codecs = [
43
  'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
44
  'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
45
  'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
46
- 'utf_32', 'utf_32_be', 'utf_32_le''utf_16_be', 'utf_16_le', 'utf_7'
 
 
47
  ]
48
 
49
 
50
  def find_codec(blob):
51
- global all_codecs
 
 
 
52
  for c in all_codecs:
53
  try:
54
  blob[:1024].decode(c)
 
28
  from PIL import Image
29
  import json
30
 
31
+ import chardet
32
+
33
  all_codecs = [
34
  'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
35
  'cp037', 'cp273', 'cp424', 'cp437',
 
45
  'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
46
  'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
47
  'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
48
+ 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7', 'windows-1250', 'windows-1251',
49
+ 'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256',
50
+ 'windows-1257', 'windows-1258', 'latin-2'
51
  ]
52
 
53
 
54
  def find_codec(blob):
55
+ detected = chardet.detect(blob[:1024])
56
+ if detected['confidence'] > 0.5:
57
+ return detected['encoding']
58
+
59
  for c in all_codecs:
60
  try:
61
  blob[:1024].decode(c)