fix: Large document thumbnail display failed (#2763)
Browse files
### What problem does this PR solve?
In MySQL, when a document's base64-encoded thumbnail is relatively large,
the document's thumbnail fails to display.
This PR stores the document thumbnail in MinIO storage instead.
### Type of change
- [✓] Bug Fix (non-breaking change which fixes an issue)
---------
Co-authored-by: chongchuanbing <[email protected]>
- api/apps/document_app.py +6 -0
- api/contants.py +3 -1
- api/db/services/file_service.py +10 -3
- api/utils/file_utils.py +10 -8
api/apps/document_app.py
CHANGED
|
@@ -51,6 +51,7 @@ from api.utils.api_utils import get_json_result
|
|
| 51 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 52 |
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
| 53 |
from api.utils.web_utils import html2pdf, is_valid_url
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
@manager.route('/upload', methods=['POST'])
|
|
@@ -209,6 +210,11 @@ def list_docs():
|
|
| 209 |
try:
|
| 210 |
docs, tol = DocumentService.get_by_kb_id(
|
| 211 |
kb_id, page_number, items_per_page, orderby, desc, keywords)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
return get_json_result(data={"total": tol, "docs": docs})
|
| 213 |
except Exception as e:
|
| 214 |
return server_error_response(e)
|
|
|
|
| 51 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 52 |
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
| 53 |
from api.utils.web_utils import html2pdf, is_valid_url
|
| 54 |
+
from api.contants import IMG_BASE64_PREFIX
|
| 55 |
|
| 56 |
|
| 57 |
@manager.route('/upload', methods=['POST'])
|
|
|
|
| 210 |
try:
|
| 211 |
docs, tol = DocumentService.get_by_kb_id(
|
| 212 |
kb_id, page_number, items_per_page, orderby, desc, keywords)
|
| 213 |
+
|
| 214 |
+
for doc_item in docs:
|
| 215 |
+
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
|
| 216 |
+
doc_item['thumbnail'] = f'/v1/document/image/{kb_id}-{doc_item['thumbnail']}'
|
| 217 |
+
|
| 218 |
return get_json_result(data={"total": tol, "docs": docs})
|
| 219 |
except Exception as e:
|
| 220 |
return server_error_response(e)
|
api/contants.py
CHANGED
|
@@ -13,4 +13,6 @@
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
|
| 16 |
-
NAME_LENGTH_LIMIT = 2 ** 10
|
|
|
|
|
|
|
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
|
| 16 |
+
NAME_LENGTH_LIMIT = 2 ** 10
|
| 17 |
+
|
| 18 |
+
IMG_BASE64_PREFIX = 'data:image/png;base64,'
|
api/db/services/file_service.py
CHANGED
|
@@ -26,7 +26,7 @@ from api.db.services.common_service import CommonService
|
|
| 26 |
from api.db.services.document_service import DocumentService
|
| 27 |
from api.db.services.file2document_service import File2DocumentService
|
| 28 |
from api.utils import get_uuid
|
| 29 |
-
from api.utils.file_utils import filename_type,
|
| 30 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 31 |
|
| 32 |
|
|
@@ -354,8 +354,15 @@ class FileService(CommonService):
|
|
| 354 |
location += "_"
|
| 355 |
blob = file.read()
|
| 356 |
STORAGE_IMPL.put(kb.id, location, blob)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
doc = {
|
| 358 |
-
"id":
|
| 359 |
"kb_id": kb.id,
|
| 360 |
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
|
| 361 |
"parser_config": kb.parser_config,
|
|
@@ -364,7 +371,7 @@ class FileService(CommonService):
|
|
| 364 |
"name": filename,
|
| 365 |
"location": location,
|
| 366 |
"size": len(blob),
|
| 367 |
-
"thumbnail":
|
| 368 |
}
|
| 369 |
DocumentService.insert(doc)
|
| 370 |
|
|
|
|
| 26 |
from api.db.services.document_service import DocumentService
|
| 27 |
from api.db.services.file2document_service import File2DocumentService
|
| 28 |
from api.utils import get_uuid
|
| 29 |
+
from api.utils.file_utils import filename_type, thumbnail_img
|
| 30 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 31 |
|
| 32 |
|
|
|
|
| 354 |
location += "_"
|
| 355 |
blob = file.read()
|
| 356 |
STORAGE_IMPL.put(kb.id, location, blob)
|
| 357 |
+
|
| 358 |
+
doc_id = get_uuid()
|
| 359 |
+
|
| 360 |
+
img = thumbnail_img(filename, blob)
|
| 361 |
+
thumbnail_location = f'thumbnail_{doc_id}.png'
|
| 362 |
+
STORAGE_IMPL.put(kb.id, thumbnail_location, img)
|
| 363 |
+
|
| 364 |
doc = {
|
| 365 |
+
"id": doc_id,
|
| 366 |
"kb_id": kb.id,
|
| 367 |
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
|
| 368 |
"parser_config": kb.parser_config,
|
|
|
|
| 371 |
"name": filename,
|
| 372 |
"location": location,
|
| 373 |
"size": len(blob),
|
| 374 |
+
"thumbnail": thumbnail_location
|
| 375 |
}
|
| 376 |
DocumentService.insert(doc)
|
| 377 |
|
api/utils/file_utils.py
CHANGED
|
@@ -25,6 +25,7 @@ from cachetools import LRUCache, cached
|
|
| 25 |
from ruamel.yaml import YAML
|
| 26 |
|
| 27 |
from api.db import FileType
|
|
|
|
| 28 |
|
| 29 |
PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
|
| 30 |
RAG_BASE = os.getenv("RAG_BASE")
|
|
@@ -168,23 +169,20 @@ def filename_type(filename):
|
|
| 168 |
|
| 169 |
return FileType.OTHER.value
|
| 170 |
|
| 171 |
-
|
| 172 |
-
def thumbnail(filename, blob):
|
| 173 |
filename = filename.lower()
|
| 174 |
if re.match(r".*\.pdf$", filename):
|
| 175 |
pdf = pdfplumber.open(BytesIO(blob))
|
| 176 |
buffered = BytesIO()
|
| 177 |
pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
|
| 178 |
-
return
|
| 179 |
-
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 180 |
|
| 181 |
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
| 182 |
image = Image.open(BytesIO(blob))
|
| 183 |
image.thumbnail((30, 30))
|
| 184 |
buffered = BytesIO()
|
| 185 |
image.save(buffered, format="png")
|
| 186 |
-
return
|
| 187 |
-
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 188 |
|
| 189 |
if re.match(r".*\.(ppt|pptx)$", filename):
|
| 190 |
import aspose.slides as slides
|
|
@@ -194,11 +192,15 @@ def thumbnail(filename, blob):
|
|
| 194 |
buffered = BytesIO()
|
| 195 |
presentation.slides[0].get_thumbnail(0.03, 0.03).save(
|
| 196 |
buffered, drawing.imaging.ImageFormat.png)
|
| 197 |
-
return
|
| 198 |
-
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 199 |
except Exception as e:
|
| 200 |
pass
|
|
|
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
def traversal_files(base):
|
| 204 |
for root, ds, fs in os.walk(base):
|
|
|
|
| 25 |
from ruamel.yaml import YAML
|
| 26 |
|
| 27 |
from api.db import FileType
|
| 28 |
+
from api.contants import IMG_BASE64_PREFIX
|
| 29 |
|
| 30 |
PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
|
| 31 |
RAG_BASE = os.getenv("RAG_BASE")
|
|
|
|
| 169 |
|
| 170 |
return FileType.OTHER.value
|
| 171 |
|
| 172 |
+
def thumbnail_img(filename, blob):
|
|
|
|
| 173 |
filename = filename.lower()
|
| 174 |
if re.match(r".*\.pdf$", filename):
|
| 175 |
pdf = pdfplumber.open(BytesIO(blob))
|
| 176 |
buffered = BytesIO()
|
| 177 |
pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
|
| 178 |
+
return buffered.getvalue()
|
|
|
|
| 179 |
|
| 180 |
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
| 181 |
image = Image.open(BytesIO(blob))
|
| 182 |
image.thumbnail((30, 30))
|
| 183 |
buffered = BytesIO()
|
| 184 |
image.save(buffered, format="png")
|
| 185 |
+
return buffered.getvalue()
|
|
|
|
| 186 |
|
| 187 |
if re.match(r".*\.(ppt|pptx)$", filename):
|
| 188 |
import aspose.slides as slides
|
|
|
|
| 192 |
buffered = BytesIO()
|
| 193 |
presentation.slides[0].get_thumbnail(0.03, 0.03).save(
|
| 194 |
buffered, drawing.imaging.ImageFormat.png)
|
| 195 |
+
return buffered.getvalue()
|
|
|
|
| 196 |
except Exception as e:
|
| 197 |
pass
|
| 198 |
+
return None
|
| 199 |
|
| 200 |
+
def thumbnail(filename, blob):
    """Return the thumbnail of a document as a base64 data URI string.

    The raw thumbnail bytes are produced by ``thumbnail_img``; they are
    base64-encoded and prefixed with ``IMG_BASE64_PREFIX``.

    Args:
        filename: Name of the document; passed through to ``thumbnail_img``.
        blob: Raw bytes of the document.

    Returns:
        ``IMG_BASE64_PREFIX`` followed by the base64 text of the thumbnail,
        or ``None`` when ``thumbnail_img`` produced no image.
    """
    img = thumbnail_img(filename, blob)
    # thumbnail_img returns None when no thumbnail could be generated;
    # base64.b64encode(None) would raise TypeError, so propagate None instead.
    if img is None:
        return None
    return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
|
| 204 |
|
| 205 |
def traversal_files(base):
|
| 206 |
for root, ds, fs in os.walk(base):
|