liuhua
liuhua
committed on
Commit
·
78eb735
1
Parent(s):
3c2255f
Fix the bug causing garbled text (#3640)
Browse files
### What problem does this PR solve?
Fix the bug causing garbled text #3613
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
Co-authored-by: liuhua <[email protected]>
- api/apps/kb_app.py +2 -2
- api/apps/sdk/dataset.py +2 -2
- rag/nlp/__init__.py +9 -2
api/apps/kb_app.py
CHANGED
|
@@ -162,9 +162,9 @@ def rm():
|
|
| 162 |
message="Database error (Document removal)!")
|
| 163 |
f2d = File2DocumentService.get_by_document_id(doc.id)
|
| 164 |
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
|
| 165 |
-
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
|
| 166 |
File2DocumentService.delete_by_document_id(doc.id)
|
| 167 |
-
|
|
|
|
| 168 |
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
|
| 169 |
return get_data_error_result(
|
| 170 |
message="Database error (Knowledgebase removal)!")
|
|
|
|
| 162 |
message="Database error (Document removal)!")
|
| 163 |
f2d = File2DocumentService.get_by_document_id(doc.id)
|
| 164 |
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
|
|
|
|
| 165 |
File2DocumentService.delete_by_document_id(doc.id)
|
| 166 |
+
FileService.filter_delete(
|
| 167 |
+
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
|
| 168 |
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
|
| 169 |
return get_data_error_result(
|
| 170 |
message="Database error (Knowledgebase removal)!")
|
api/apps/sdk/dataset.py
CHANGED
|
@@ -252,9 +252,9 @@ def delete(tenant_id):
|
|
| 252 |
File.id == f2d[0].file_id,
|
| 253 |
]
|
| 254 |
)
|
| 255 |
-
FileService.filter_delete(
|
| 256 |
-
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
|
| 257 |
File2DocumentService.delete_by_document_id(doc.id)
|
|
|
|
|
|
|
| 258 |
if not KnowledgebaseService.delete_by_id(id):
|
| 259 |
return get_error_data_result(message="Delete dataset error.(Database error)")
|
| 260 |
return get_result(code=settings.RetCode.SUCCESS)
|
|
|
|
| 252 |
File.id == f2d[0].file_id,
|
| 253 |
]
|
| 254 |
)
|
|
|
|
|
|
|
| 255 |
File2DocumentService.delete_by_document_id(doc.id)
|
| 256 |
+
FileService.filter_delete(
|
| 257 |
+
[File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
|
| 258 |
if not KnowledgebaseService.delete_by_id(id):
|
| 259 |
return get_error_data_result(message="Delete dataset error.(Database error)")
|
| 260 |
return get_result(code=settings.RetCode.SUCCESS)
|
rag/nlp/__init__.py
CHANGED
|
@@ -28,6 +28,8 @@ from cn2an import cn2an
|
|
| 28 |
from PIL import Image
|
| 29 |
import json
|
| 30 |
|
|
|
|
|
|
|
| 31 |
all_codecs = [
|
| 32 |
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
| 33 |
'cp037', 'cp273', 'cp424', 'cp437',
|
|
@@ -43,12 +45,17 @@ all_codecs = [
|
|
| 43 |
'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
|
| 44 |
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
|
| 45 |
'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
|
| 46 |
-
'utf_32', 'utf_32_be', 'utf_32_le''utf_16_be', 'utf_16_le', 'utf_7'
|
|
|
|
|
|
|
| 47 |
]
|
| 48 |
|
| 49 |
|
| 50 |
def find_codec(blob):
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
for c in all_codecs:
|
| 53 |
try:
|
| 54 |
blob[:1024].decode(c)
|
|
|
|
| 28 |
from PIL import Image
|
| 29 |
import json
|
| 30 |
|
| 31 |
+
import chardet
|
| 32 |
+
|
| 33 |
all_codecs = [
|
| 34 |
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
| 35 |
'cp037', 'cp273', 'cp424', 'cp437',
|
|
|
|
| 45 |
'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
|
| 46 |
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
|
| 47 |
'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
|
| 48 |
+
'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7', 'windows-1250', 'windows-1251',
|
| 49 |
+
'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256',
|
| 50 |
+
'windows-1257', 'windows-1258', 'latin-2'
|
| 51 |
]
|
| 52 |
|
| 53 |
|
| 54 |
def find_codec(blob):
|
| 55 |
+
detected = chardet.detect(blob[:1024])
|
| 56 |
+
if detected['confidence'] > 0.5:
|
| 57 |
+
return detected['encoding']
|
| 58 |
+
|
| 59 |
for c in all_codecs:
|
| 60 |
try:
|
| 61 |
blob[:1024].decode(c)
|