diff --git a/agent/canvas.py b/agent/canvas.py
index 0b6281398f8b3cd50853f3f7050047075662c35d..80b28d3fb71b4c0f7ad5bbee5656999597583f7d 100644
--- a/agent/canvas.py
+++ b/agent/canvas.py
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import json
 from abc import ABC
 from copy import deepcopy
 from functools import partial
 from agent.component import component_class
 from agent.component.base import ComponentBase
-from api.utils.log_utils import logger
 
 class Canvas(ABC):
     """
@@ -187,7 +187,7 @@ class Canvas(ABC):
             if cpn.component_name == "Answer":
                 self.answer.append(c)
             else:
-                logger.debug(f"Canvas.prepare2run: {c}")
+                logging.debug(f"Canvas.prepare2run: {c}")
                 cpids = cpn.get_dependent_components()
                 if any([c not in self.path[-1] for c in cpids]):
                     continue
@@ -197,7 +197,7 @@ class Canvas(ABC):
             prepare2run(self.components[self.path[-2][-1]]["downstream"])
 
         while 0 <= ran < len(self.path[-1]):
-            logger.debug(f"Canvas.run: {ran} {self.path}")
+            logging.debug(f"Canvas.run: {ran} {self.path}")
             cpn_id = self.path[-1][ran]
             cpn = self.get_component(cpn_id)
             if not cpn["downstream"]: break
@@ -217,7 +217,7 @@ class Canvas(ABC):
                         self.get_component(p)["obj"].set_exception(e)
                         prepare2run([p])
                         break
-                    logger.exception("Canvas.run got exception")
+                    logging.exception("Canvas.run got exception")
                     break
                 continue
 
@@ -229,7 +229,7 @@ class Canvas(ABC):
                     self.get_component(p)["obj"].set_exception(e)
                     prepare2run([p])
                     break
-                logger.exception("Canvas.run got exception")
+                logging.exception("Canvas.run got exception")
                 break
 
         if self.answer:
diff --git a/agent/component/arxiv.py b/agent/component/arxiv.py
index c3f52d64f347680c3cbf9f6b6d024c24b362aba0..a7df3385cec163b53cdcf95a4ad9ddb3385e5a21 100644
--- a/agent/component/arxiv.py
+++ b/agent/component/arxiv.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import arxiv
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 class ArXivParam(ComponentParamBase):
     """
@@ -64,5 +64,5 @@ class ArXiv(ComponentBase, ABC):
             return ArXiv.be_output("")
 
         df = pd.DataFrame(arxiv_res)
-        logger.debug(f"df: {str(df)}")
+        logging.debug(f"df: {str(df)}")
         return df
diff --git a/agent/component/baidu.py b/agent/component/baidu.py
index cb1f5a2a52f6a6d72926b1539fdeb92d2c1ed24c..7311b43cff1dde7e5a88ede05fb4132d0f2e9869 100644
--- a/agent/component/baidu.py
+++ b/agent/component/baidu.py
@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import pandas as pd
 import requests
 import re
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class BaiduParam(ComponentParamBase):
@@ -62,6 +62,6 @@ class Baidu(ComponentBase, ABC):
             return Baidu.be_output("")
 
         df = pd.DataFrame(baidu_res)
-        logger.debug(f"df: {str(df)}")
+        logging.debug(f"df: {str(df)}")
         return df
 
diff --git a/agent/component/base.py b/agent/component/base.py
index ae34b6155504c752d40bbe8f38e1a0e448198921..2eee4a25ffda297975c0661564fea87979faa76d 100644
--- a/agent/component/base.py
+++ b/agent/component/base.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import builtins
 import json
@@ -23,7 +24,6 @@ from typing import Tuple, Union
 import pandas as pd
 
 from agent import settings
-from api.utils.log_utils import logger
 
 _FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
@@ -361,13 +361,13 @@ class ComponentParamBase(ABC):
 
     def _warn_deprecated_param(self, param_name, descr):
         if self._deprecated_params_set.get(param_name):
-            logger.warning(
+            logging.warning(
                 f"{descr} {param_name} is deprecated and ignored in this version."
             )
 
     def _warn_to_deprecate_param(self, param_name, descr, new_param):
         if self._deprecated_params_set.get(param_name):
-            logger.warning(
+            logging.warning(
                 f"{descr} {param_name} will be deprecated in future release; "
                 f"please use {new_param} instead."
             )
@@ -403,7 +403,7 @@ class ComponentBase(ABC):
         return cpnts
 
     def run(self, history, **kwargs):
-        logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
+        logging.debug("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
                                                           json.dumps(kwargs, ensure_ascii=False)))
         try:
             res = self._run(history, **kwargs)
@@ -476,7 +476,7 @@ class ComponentBase(ABC):
             reversed_cpnts.extend(self._canvas.path[-2])
         reversed_cpnts.extend(self._canvas.path[-1])
 
-        logger.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
+        logging.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
         for u in reversed_cpnts[::-1]:
             if self.get_component_name(u) in ["switch", "concentrator"]: continue
             if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":
diff --git a/agent/component/bing.py b/agent/component/bing.py
index 7497c4c97944935fdd15726782345120a841ffa9..6ec97c196b18825af731bbfcffaf800898ee2231 100644
--- a/agent/component/bing.py
+++ b/agent/component/bing.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import requests
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 class BingParam(ComponentParamBase):
     """
@@ -80,5 +80,5 @@ class Bing(ComponentBase, ABC):
             return Bing.be_output("")
 
         df = pd.DataFrame(bing_res)
-        logger.debug(f"df: {str(df)}")
+        logging.debug(f"df: {str(df)}")
         return df
diff --git a/agent/component/categorize.py b/agent/component/categorize.py
index f1c8855ca15a1e48f7e8c45daa7b8a0b31e26b8f..94f10c799163747e18a80aef3849302665869574 100644
--- a/agent/component/categorize.py
+++ b/agent/component/categorize.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
-from api.utils.log_utils import logger
 
 
 class CategorizeParam(GenerateParam):
@@ -77,7 +77,7 @@ class Categorize(Generate, ABC):
         chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
         ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": input}],
                             self._param.gen_conf())
-        logger.debug(f"input: {input}, answer: {str(ans)}")
+        logging.debug(f"input: {input}, answer: {str(ans)}")
         for c in self._param.category_description.keys():
             if ans.lower().find(c.lower()) >= 0:
                 return Categorize.be_output(self._param.category_description[c]["to"])
diff --git a/agent/component/duckduckgo.py b/agent/component/duckduckgo.py
index b0ad40c23169bbb04541b7921ef522430eddd697..9f460a699dc0ef6fd3b8c128fb52c5e0d2f650f6 100644
--- a/agent/component/duckduckgo.py
+++ b/agent/component/duckduckgo.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from duckduckgo_search import DDGS
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class DuckDuckGoParam(ComponentParamBase):
@@ -62,5 +62,5 @@ class DuckDuckGo(ComponentBase, ABC):
             return DuckDuckGo.be_output("")
 
         df = pd.DataFrame(duck_res)
-        logger.debug("df: {df}")
+        logging.debug("df: {df}")
         return df
diff --git a/agent/component/github.py b/agent/component/github.py
index 20c9a0c3ff06106982065a2a4d2567d1340140e3..4149da37ab468aac11fbe5618c9b48893995ce9a 100644
--- a/agent/component/github.py
+++ b/agent/component/github.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import pandas as pd
 import requests
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class GitHubParam(ComponentParamBase):
@@ -57,5 +57,5 @@ class GitHub(ComponentBase, ABC):
             return GitHub.be_output("")
 
         df = pd.DataFrame(github_res)
-        logger.debug(f"df: {df}")
+        logging.debug(f"df: {df}")
         return df
diff --git a/agent/component/google.py b/agent/component/google.py
index 0e0cec40ed1fce9a7fdddc1b1435531bddb792be..691dc239746377928594f12dd38526a6879f23f6 100644
--- a/agent/component/google.py
+++ b/agent/component/google.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from serpapi import GoogleSearch
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class GoogleParam(ComponentParamBase):
@@ -92,5 +92,5 @@ class Google(ComponentBase, ABC):
             return Google.be_output("")
 
         df = pd.DataFrame(google_res)
-        logger.debug(f"df: {df}")
+        logging.debug(f"df: {df}")
         return df
diff --git a/agent/component/googlescholar.py b/agent/component/googlescholar.py
index d6dc146891d5ed723701228b94883374b7972f74..4ad580ff72f97369efb9a666e9ddecb860c585ea 100644
--- a/agent/component/googlescholar.py
+++ b/agent/component/googlescholar.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
 from scholarly import scholarly
-from api.utils.log_utils import logger
 
 
 class GoogleScholarParam(ComponentParamBase):
@@ -59,12 +59,12 @@ class GoogleScholar(ComponentBase, ABC):
                                     'bib'].get('abstract', 'no abstract')})
 
             except StopIteration or Exception:
-                logger.exception("GoogleScholar")
+                logging.exception("GoogleScholar")
                 break
 
         if not scholar_res:
             return GoogleScholar.be_output("")
 
         df = pd.DataFrame(scholar_res)
-        logger.debug(f"df: {df}")
+        logging.debug(f"df: {df}")
         return df
diff --git a/agent/component/keyword.py b/agent/component/keyword.py
index abc0c40ccb741e9034a837e0035b1855a3ab094d..1e417511ede377577f915a5ba57607067634d367 100644
--- a/agent/component/keyword.py
+++ b/agent/component/keyword.py
@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import re
 from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
-from api.utils.log_utils import logger
 
 
 class KeywordExtractParam(GenerateParam):
@@ -58,5 +58,5 @@ class KeywordExtract(Generate, ABC):
                             self._param.gen_conf())
 
         ans = re.sub(r".*keyword:", "", ans).strip()
-        logger.info(f"ans: {ans}")
+        logging.debug(f"ans: {ans}")
         return KeywordExtract.be_output(ans)
diff --git a/agent/component/pubmed.py b/agent/component/pubmed.py
index 409fb05258df50a55f0ee455404be2ef0a9d5bfa..8f41d3c972f9d1b974f4129c6506e87e29fa1ffc 100644
--- a/agent/component/pubmed.py
+++ b/agent/component/pubmed.py
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from Bio import Entrez
 import re
 import pandas as pd
 import xml.etree.ElementTree as ET
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class PubMedParam(ComponentParamBase):
@@ -65,5 +65,5 @@ class PubMed(ComponentBase, ABC):
             return PubMed.be_output("")
 
         df = pd.DataFrame(pubmed_res)
-        logger.debug(f"df: {df}")
+        logging.debug(f"df: {df}")
         return df
diff --git a/agent/component/relevant.py b/agent/component/relevant.py
index e20083bbdd6c252e7722c3ca3b194784c69eea52..65a8433d13910ff2897503a54790a3d98a96b15a 100644
--- a/agent/component/relevant.py
+++ b/agent/component/relevant.py
@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
 from rag.utils import num_tokens_from_string, encoder
-from api.utils.log_utils import logger
 
 
 class RelevantParam(GenerateParam):
@@ -71,7 +71,7 @@ class Relevant(Generate, ABC):
         ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": ans}],
                             self._param.gen_conf())
 
-        logger.info(ans)
+        logging.debug(ans)
         if ans.lower().find("yes") >= 0:
             return Relevant.be_output(self._param.yes)
         if ans.lower().find("no") >= 0:
diff --git a/agent/component/retrieval.py b/agent/component/retrieval.py
index 0fc0c714e8cfc5a7431545fa3534e1ec1ed77f0b..ab48e04a77021a67716e05d249bb751a53c2bdec 100644
--- a/agent/component/retrieval.py
+++ b/agent/component/retrieval.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 
 import pandas as pd
@@ -22,7 +23,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api.settings import retrievaler
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class RetrievalParam(ComponentParamBase):
@@ -81,7 +81,7 @@ class Retrieval(ComponentBase, ABC):
         df = pd.DataFrame(kbinfos["chunks"])
         df["content"] = df["content_with_weight"]
         del df["content_with_weight"]
-        logger.debug("{} {}".format(query, df))
+        logging.debug("{} {}".format(query, df))
         return df
diff --git a/agent/component/rewrite.py b/agent/component/rewrite.py
index 7cfc01a61e3b88dd195bc0415ba0a8f14a85316e..4257a81fda0188cd023979e879af65edc4fac26b 100644
--- a/agent/component/rewrite.py
+++ b/agent/component/rewrite.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
-from api.utils.log_utils import logger
 
 
 class RewriteQuestionParam(GenerateParam):
@@ -105,7 +105,7 @@ class RewriteQuestion(Generate, ABC):
         self._canvas.history.pop()
         self._canvas.history.append(("user", ans))
 
-        logger.info(ans)
+        logging.debug(ans)
         return RewriteQuestion.be_output(ans)
 
diff --git a/agent/component/wikipedia.py b/agent/component/wikipedia.py
index 3d773a63761b0a5f21b6432bf35141f1fed9d984..8ccadca21f8a763a8bab33368e522bdd773bf95d 100644
--- a/agent/component/wikipedia.py
+++ b/agent/component/wikipedia.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import wikipedia
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
-from api.utils.log_utils import logger
 
 
 class WikipediaParam(ComponentParamBase):
@@ -63,5 +63,5 @@ class Wikipedia(ComponentBase, ABC):
             return Wikipedia.be_output("")
 
         df = pd.DataFrame(wiki_res)
-        logger.debug(f"df: {df}")
+        logging.debug(f"df: {df}")
         return df
diff --git a/agent/component/yahoofinance.py b/agent/component/yahoofinance.py
index 139c72345c60df8842bc43cb59a4e8ffbe727344..ac932bc682796655e7675e728737899112620914 100644
--- a/agent/component/yahoofinance.py
+++ b/agent/component/yahoofinance.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from abc import ABC
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
 import yfinance as yf
-from api.utils.log_utils import logger
 
 
 class YahooFinanceParam(ComponentParamBase):
@@ -76,7 +76,7 @@ class YahooFinance(ComponentBase, ABC):
             if self._param.news:
                 yohoo_res.append({"content": "news:\n" + pd.DataFrame(msft.news).to_markdown() + "\n"})
         except Exception:
-            logger.exception("YahooFinance got exception")
+            logging.exception("YahooFinance got exception")
 
         if not yohoo_res:
             return YahooFinance.be_output("")
diff --git a/api/apps/__init__.py b/api/apps/__init__.py
index d1dc755cfcf731d92c1b4fc75f3e115dcdc3116d..73f323a14907a9ed517130e07064a4937a955067 100644
--- a/api/apps/__init__.py
+++ b/api/apps/__init__.py
@@ -15,6 +15,7 @@
 #
 import os
 import sys
+import logging
 from importlib.util import module_from_spec, spec_from_file_location
 from pathlib import Path
 from flask import Blueprint, Flask
@@ -32,7 +33,6 @@ from flask_login import LoginManager
 from api.settings import SECRET_KEY
 from api.settings import API_VERSION
 from api.utils.api_utils import server_error_response
-from api.utils.log_utils import logger
 from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
 
 __all__ = ["app"]
@@ -154,7 +154,7 @@ def load_user(web_request):
             else:
                 return None
         except Exception:
-            logger.exception("load_user got exception")
+            logging.exception("load_user got exception")
             return None
     else:
         return None
diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py
index 6ef41c5e9efaeb50c0e1019161c921abb38e9c63..751202f8ae781d7b5e36ccae5df8109233628059 100644
--- a/api/apps/canvas_app.py
+++ b/api/apps/canvas_app.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import json
 from functools import partial
 from flask import request, Response
@@ -23,7 +24,6 @@ from api.utils import get_uuid
 from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
 from agent.canvas import Canvas
 from peewee import MySQLDatabase, PostgresqlDatabase
-from api.utils.log_utils import logger
 
 
 @manager.route('/templates', methods=['GET'])
@@ -115,7 +115,7 @@ def run():
                 pass
         canvas.add_user_input(req["message"])
         answer = canvas.run(stream=stream)
-        logger.info(canvas)
+        logging.debug(canvas)
     except Exception as e:
         return server_error_response(e)
diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index f035da82b5fd0dd8a2f4e758483ca10b93e001a0..487c70e2bba70b5c80c7c4a3c3f8abcc742fcc59 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import json
 from flask import request
@@ -25,7 +26,6 @@ from api.db.db_models import TenantLLM
 from api.utils.api_utils import get_json_result
 from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel
 import requests
-from api.utils.log_utils import logger
 
 
 @manager.route('/factories', methods=['GET'])
@@ -90,7 +90,7 @@ def set_api_key():
                 if len(arr) == 0 or tc == 0:
                     raise Exception("Fail")
                 rerank_passed = True
-                logger.info(f'passed model rerank {llm.llm_name}')
+                logging.debug(f'passed model rerank {llm.llm_name}')
         except Exception as e:
             msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(
                 e)
diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index 35a9a46c2755d42683c3999521ed8e1a80e12be6..08ab1227d3824a2dd07d12fa91924b0f99f8948f 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import json
 import re
 from datetime import datetime
@@ -54,7 +55,6 @@ from api.settings import (
 from api.db.services.user_service import UserService, TenantService, UserTenantService
 from api.db.services.file_service import FileService
 from api.utils.api_utils import get_json_result, construct_response
-from api.utils.log_utils import logger
 
 
 @manager.route("/login", methods=["POST", "GET"])
@@ -177,7 +177,7 @@ def github_callback():
         try:
             avatar = download_img(user_info["avatar_url"])
         except Exception as e:
-            logger.exception(e)
+            logging.exception(e)
             avatar = ""
         users = user_register(
             user_id,
@@ -202,7 +202,7 @@ def github_callback():
         return redirect("/?auth=%s" % user.get_id())
     except Exception as e:
         rollback_user_registration(user_id)
-        logger.exception(e)
+        logging.exception(e)
         return redirect("/?error=%s" % str(e))
 
     # User has already registered, try to log in
@@ -279,7 +279,7 @@ def feishu_callback():
         try:
             avatar = download_img(user_info["avatar_url"])
         except Exception as e:
-            logger.exception(e)
+            logging.exception(e)
             avatar = ""
         users = user_register(
             user_id,
@@ -304,7 +304,7 @@ def feishu_callback():
         return redirect("/?auth=%s" % user.get_id())
     except Exception as e:
         rollback_user_registration(user_id)
-        logger.exception(e)
+        logging.exception(e)
         return redirect("/?error=%s" % str(e))
 
     # User has already registered, try to log in
@@ -436,7 +436,7 @@ def setting_user():
         UserService.update_by_id(current_user.id, update_dict)
         return get_json_result(data=True)
     except Exception as e:
-        logger.exception(e)
+        logging.exception(e)
         return get_json_result(
             data=False, message="Update failure!", code=RetCode.EXCEPTION_ERROR
         )
@@ -621,7 +621,7 @@ def user_add():
         )
     except Exception as e:
         rollback_user_registration(user_id)
-        logger.exception(e)
+        logging.exception(e)
         return get_json_result(
             data=False,
             message=f"User registration failure, error: {str(e)}",
diff --git a/api/db/db_models.py b/api/db/db_models.py
index 29c3c2618bc9cf3eb5d2b8d84f30c8580193f3b8..7c63254569c2053686f60e131df3909da268294e 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import inspect
 import os
 import sys
@@ -32,7 +33,6 @@ from playhouse.pool import PooledMySQLDatabase, PooledPostgresqlDatabase
 from api.db import SerializedType, ParserType
 from api.settings import DATABASE, SECRET_KEY, DATABASE_TYPE
 from api import utils
-from api.utils.log_utils import logger
 
 
 def singleton(cls, *args, **kw):
     instances = {}
@@ -285,7 +285,7 @@ class BaseDataBase:
         database_config = DATABASE.copy()
         db_name = database_config.pop("name")
         self.database_connection = PooledDatabase[DATABASE_TYPE.upper()].value(db_name, **database_config)
-        logger.info('init database on cluster mode successfully')
+        logging.info('init database on cluster mode successfully')
 
 class PostgresDatabaseLock:
     def __init__(self, lock_name, timeout=10, db=None):
@@ -393,7 +393,7 @@ def close_connection():
         if DB:
             DB.close_stale(age=30)
     except Exception as e:
-        logger.exception(e)
+        logging.exception(e)
 
 
 class DataBaseModel(BaseModel):
@@ -409,15 +409,15 @@ def init_database_tables(alter_fields=[]):
     for name, obj in members:
         if obj != DataBaseModel and issubclass(obj, DataBaseModel):
             table_objs.append(obj)
-            logger.info(f"start create table {obj.__name__}")
+            logging.debug(f"start create table {obj.__name__}")
             try:
                 obj.create_table()
-                logger.info(f"create table success: {obj.__name__}")
+                logging.debug(f"create table success: {obj.__name__}")
             except Exception as e:
-                logger.exception(e)
+                logging.exception(e)
                 create_failed_list.append(obj.__name__)
     if create_failed_list:
-        logger.info(f"create tables failed: {create_failed_list}")
+        logging.error(f"create tables failed: {create_failed_list}")
         raise Exception(f"create tables failed: {create_failed_list}")
     migrate_db()
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 20e42aab6598c48233baef6a8948740a54d31576..c04169ad32bba92895369922cf12a86d1c555bde 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import base64
 import json
 import os
@@ -30,7 +31,6 @@ from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantL
 from api.db.services.user_service import TenantService, UserTenantService
 from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger
 
 
 def encode_to_base64(input_string):
@@ -70,26 +70,26 @@ def init_superuser():
          "api_key": API_KEY, "api_base": LLM_BASE_URL})
 
     if not UserService.save(**user_info):
-        logger.info("can't init admin.")
+        logging.error("can't init admin.")
         return
     TenantService.insert(**tenant)
     UserTenantService.insert(**usr_tenant)
     TenantLLMService.insert_many(tenant_llm)
-    logger.info(
-        "Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after logining is strongly recomanded.")
+    logging.info(
+        "Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after login is strongly recommended.")
 
     chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"])
     msg = chat_mdl.chat(system="", history=[
                         {"role": "user", "content": "Hello!"}], gen_conf={})
     if msg.find("ERROR: ") == 0:
-        logger.error(
+        logging.error(
             "'{}' dosen't work. {}".format(
                 tenant["llm_id"],
                 msg))
     embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"])
     v, c = embd_mdl.encode(["Hello!"])
    if c == 0:
-        logger.error(
+        logging.error(
             "'{}' dosen't work!".format(
                 tenant["embd_id"]))
@@ -172,7 +172,7 @@ def add_graph_templates():
             except:
                 CanvasTemplateService.update_by_id(cnvs["id"], cnvs)
     except Exception:
-        logger.exception("Add graph templates error: ")
+        logging.exception("Add graph templates error: ")
 
 
 def init_web_data():
@@ -183,7 +183,7 @@ def init_web_data():
     # init_superuser()
 
     add_graph_templates()
-    logger.info("init web data success:{}".format(time.time() - start_time))
+    logging.info("init web data success:{}".format(time.time() - start_time))
 
 
 if __name__ == '__main__':
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index 3cad184ae61dd5a52a99af08c0250c4fb776d57f..c586309b9cd610bf537e7cef94629d47f1d92816 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import binascii
 import os
 import json
@@ -31,7 +32,6 @@ from rag.app.resume import forbidden_select_fields4resume
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger
 
 
 class DialogService(CommonService):
@@ -178,7 +178,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         tts_mdl = LLMBundle(dialog.tenant_id, LLMType.TTS)
     # try to use sql if field mapping is good to go
     if field_map:
-        logger.info("Use SQL to retrieval:{}".format(questions[-1]))
+        logging.debug("Use SQL to retrieval:{}".format(questions[-1]))
         ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True))
         if ans:
             yield ans
@@ -220,7 +220,7 @@ def chat(dialog, messages, stream=True, **kwargs):
                                         doc_ids=attachments,
                                         top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
         knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
-        logger.info(
+        logging.debug(
             "{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
         retrieval_tm = timer()
@@ -292,7 +292,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         yield decorate_answer(answer)
     else:
         answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
-        logger.info("User: {}|Assistant: {}".format(
+        logging.debug("User: {}|Assistant: {}".format(
             msg[-1]["content"], answer))
         res = decorate_answer(answer)
         res["audio_binary"] = tts(tts_mdl, answer)
@@ -320,7 +320,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
         nonlocal sys_prompt, user_promt, question, tried_times
         sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {
             "temperature": 0.06})
-        logger.info(f"{question} ==> {user_promt} get SQL: {sql}")
+        logging.debug(f"{question} ==> {user_promt} get SQL: {sql}")
         sql = re.sub(r"[\r\n]+", " ", sql.lower())
         sql = re.sub(r".*select ", "select ", sql.lower())
         sql = re.sub(r" +", " ", sql)
@@ -340,7 +340,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
                     flds.append(k)
             sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]
 
-        logger.info(f"{question} get SQL(refined): {sql}")
+        logging.debug(f"{question} get SQL(refined): {sql}")
         tried_times += 1
         return retrievaler.sql_retrieval(sql, format="json"), sql
@@ -369,9 +369,9 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
             question, sql, tbl["error"]
         )
         tbl, sql = get_table()
-        logger.info("TRY it again: {}".format(sql))
+        logging.debug("TRY it again: {}".format(sql))
 
-    logger.info("GET table: {}".format(tbl))
+    logging.debug("GET table: {}".format(tbl))
     if tbl.get("error") or len(tbl["rows"]) == 0:
         return None
@@ -401,7 +401,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
 
     if not docid_idx or not docnm_idx:
-        logger.warning("SQL missing field: " + sql)
+        logging.warning("SQL missing field: " + sql)
         return {
             "answer": "\n".join([clmns, line, rows]),
             "reference": {"chunks": [], "doc_aggs": []},
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index 428f1a4bc85e6a493f4f0621715bbdb5b7fc1964..301db2f0304144de74c990aff9d3812d553ecb1a 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import hashlib
 import json
 import random
@@ -39,7 +40,6 @@ from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db import StatusEnum
 from rag.utils.redis_conn import REDIS_CONN
-from api.utils.log_utils import logger
 
 
 class DocumentService(CommonService):
@@ -387,7 +387,7 @@ class DocumentService(CommonService):
                 cls.update_by_id(d["id"], info)
             except Exception as e:
                 if str(e).find("'0'") < 0:
-                    logger.exception("fetch task exception")
+                    logging.exception("fetch task exception")
@@ -544,7 +544,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
                 "knowledge_graph_kwd": "mind_map"
             })
         except Exception as e:
-            logger.exception("Mind map generation error")
+            logging.exception("Mind map generation error")
 
         vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
         assert len(cks) == len(vects)
diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py
index 6a98614a817aa47c07c620520a2164c7503284dd..2602bf3764b4ec36376b71fd322a21270108b756 100644
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import re
 import os
 from concurrent.futures import ThreadPoolExecutor
@@ -30,7 +31,6 @@ from api.db.services.file2document_service import File2DocumentService
 from api.utils import get_uuid
 from api.utils.file_utils import filename_type, thumbnail_img
 from rag.utils.storage_factory import STORAGE_IMPL
-from api.utils.log_utils import logger
 
 
 class FileService(CommonService):
@@ -276,7 +276,7 @@ class FileService(CommonService):
             return cls.model.delete().where((cls.model.tenant_id == user_id)
                                             & (cls.model.id == folder_id)).execute(),
         except Exception:
-            logger.exception("delete_folder_by_pf_id")
+            logging.exception("delete_folder_by_pf_id")
             raise RuntimeError("Database error (File retrieval)!")
 
     @classmethod
@@ -325,7 +325,7 @@ class FileService(CommonService):
         try:
            cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
        except Exception:
-            logger.exception("move_file")
+            logging.exception("move_file")
            raise RuntimeError("Database error (File move)!")
 
     @classmethod
diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py
index 6e94b15d0ee89b6b0656e400cac5513d1b80eb5e..1a91c78b6328d6a19bdffd01e929ba682caaa252 100644
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from api.db.services.user_service import TenantService
 from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, TTSModel
 from api.db import LLMType
 from api.db.db_models import DB
 from api.db.db_models import LLMFactories, LLM, TenantLLM
 from api.db.services.common_service import CommonService
-from api.utils.log_utils import logger
 
 
 class LLMFactoriesService(CommonService):
@@ -209,7 +209,7 @@ class LLMBundle(object):
         emd, used_tokens = self.mdl.encode(texts, batch_size)
         if not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens):
-            logger.error(
+            logging.error(
                 "LLMBundle.encode can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
         return emd, used_tokens
@@ -217,7 +217,7 @@ class LLMBundle(object):
         emd, used_tokens = self.mdl.encode_queries(query)
         if not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens):
-            logger.error(
+            logging.error(
                 "LLMBundle.encode_queries can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
         return emd, used_tokens
@@ -225,7 +225,7 @@ class LLMBundle(object):
         sim, used_tokens = self.mdl.similarity(query, texts)
         if not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens):
-            logger.error(
+            logging.error(
                 "LLMBundle.similarity can't update token usage for {}/RERANK used_tokens: {}".format(self.tenant_id, used_tokens))
         return sim, used_tokens
@@ -233,7 +233,7 @@ class LLMBundle(object):
         txt, used_tokens = self.mdl.describe(image, max_tokens)
         if not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens):
-            logger.error(
+            logging.error(
                 "LLMBundle.describe can't update token usage for {}/IMAGE2TEXT used_tokens: {}".format(self.tenant_id, used_tokens))
         return txt
@@ -241,7 +241,7 @@ class LLMBundle(object):
         txt, used_tokens = self.mdl.transcription(audio)
         if not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens):
-            logger.error(
+            logging.error(
                 "LLMBundle.transcription can't update token usage for {}/SEQUENCE2TXT used_tokens: {}".format(self.tenant_id, used_tokens))
         return txt
@@ -250,7 +250,7 @@ class LLMBundle(object):
             if isinstance(chunk,int):
                 if not TenantLLMService.increase_usage(
                         self.tenant_id, self.llm_type, chunk, self.llm_name):
-                    logger.error(
+                    logging.error(
                         "LLMBundle.tts can't update token usage for {}/TTS".format(self.tenant_id))
                 return
             yield chunk
@@ -259,7 +259,7 @@ class LLMBundle(object):
         txt, used_tokens = self.mdl.chat(system, history, gen_conf)
         if isinstance(txt, int) and not TenantLLMService.increase_usage(
                 self.tenant_id, self.llm_type, used_tokens, self.llm_name):
-            logger.error(
+            logging.error(
                 "LLMBundle.chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
         return txt
@@ -268,7 +268,7 @@ class LLMBundle(object):
             if isinstance(txt, int):
                 if not TenantLLMService.increase_usage(
                         self.tenant_id, self.llm_type, txt, self.llm_name):
-                    logger.error(
+                    logging.error(
                         "LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
                 return
             yield txt
diff --git a/api/ragflow_server.py b/api/ragflow_server.py
index 00cb1704204e693170019ff9e0bac81bc1bd3d2e..0049f01a5b098a5400115abc65b2920221a2b450 100644
--- a/api/ragflow_server.py
+++ b/api/ragflow_server.py
@@ -15,6 +15,17 @@
 #
 
 import logging
+import inspect
+from api.utils.log_utils import initRootLogger
+initRootLogger(inspect.getfile(inspect.currentframe()))
+for module in ["pdfminer"]:
+    module_logger = logging.getLogger(module)
+    module_logger.setLevel(logging.WARNING)
+for module in ["peewee"]:
+    module_logger = logging.getLogger(module)
+    module_logger.handlers.clear()
+    module_logger.propagate = True
+
 import os
 import signal
 import sys
@@ -22,7 +33,6 @@ import time
 import traceback
 from concurrent.futures import ThreadPoolExecutor
 
-import validation
 from werkzeug.serving import run_simple
 from api.apps import app
 from api.db.runtime_config import RuntimeConfig
@@ -31,7 +41,6 @@ from api.settings import (
     HOST, HTTP_PORT
 )
 from api import utils
-from api.utils.log_utils import logger
 
 from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
@@ -44,11 +53,11 @@ def update_progress():
         try:
             DocumentService.update_progress()
         except Exception:
-            logger.exception("update_progress exception")
+            logging.exception("update_progress exception")
 
 
 if __name__ == '__main__':
-    logger.info(r"""
+    logging.info(r"""
     ____   ___    ______ ______ __
    / __ \ /   |  / ____// ____// /____  _      __
  / /_/ // /| | / / __ / /_   / // __ \| | /| / /
@@ -56,10 +65,10 @@ if __name__ == '__main__':
 /_/ |_|/_/ |_|\____//_/    /_/     \____/ |__/|__/
 
    """)
-    logger.info(
+    logging.info(
        f'RAGFlow version: {RAGFLOW_VERSION_INFO}'
    )
-    logger.info(
+    logging.info(
        f'project base: {utils.file_utils.get_project_base_directory()}'
    )
@@ -83,26 +92,18 @@ if __name__ == '__main__':
     RuntimeConfig.DEBUG = args.debug
     if RuntimeConfig.DEBUG:
-        logger.info("run on debug mode")
+        logging.info("run on debug mode")
 
     RuntimeConfig.init_env()
     RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
 
-    peewee_logger = logging.getLogger("peewee")
-    peewee_logger.propagate = False
-    # rag_arch.common.log.ROpenHandler
-    peewee_logger.addHandler(logger.handlers[0])
-    peewee_logger.setLevel(logger.handlers[0].level)
 
     thr = ThreadPoolExecutor(max_workers=1)
     thr.submit(update_progress)
 
     # start http server
     try:
-        logger.info("RAG Flow http server start...")
-        werkzeug_logger = logging.getLogger("werkzeug")
-        for h in logger.handlers:
-            werkzeug_logger.addHandler(h)
+        logging.info("RAG Flow http server start...")
         run_simple(
             hostname=HOST,
             port=HTTP_PORT,
diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py
index 9c48ce179667ec6e91dccbaa29fb77b05ffb7966..37f4a3c92159c2dcb93558bb250350c038ca3f6d 100644
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import functools
 import json
 import random
@@ -40,7 +41,6 @@ from api.settings import (
 from api.settings import RetCode
 from api.utils import CustomJSONEncoder, get_uuid
 from api.utils import json_dumps
-from api.utils.log_utils import logger
 
 requests.models.complexjson.dumps = functools.partial(
     json.dumps, cls=CustomJSONEncoder)
@@ -118,7 +118,7 @@ def get_data_error_result(code=RetCode.DATA_ERROR,
 
 def server_error_response(e):
-    logger.exception(e)
+    logging.exception(e)
     try:
         if e.code == 401:
             return get_json_result(code=401, message=repr(e))
@@ -259,7 +259,7 @@ def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):
 
 def construct_error_response(e):
-    logger.exception(e)
+    logging.exception(e)
     try:
         if e.code == 401:
             return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e))
diff --git a/api/utils/log_utils.py b/api/utils/log_utils.py
index ffe08097893dd5267d53ca1345348813e94b0a33..4b296d036a53c9615a3b8fa655650df670ada1cb 100644
--- a/api/utils/log_utils.py
+++ b/api/utils/log_utils.py
@@ -14,38 +14,41 @@
 # limitations under the License.
 #
 import os
+import os.path
 import logging
 from logging.handlers import RotatingFileHandler
-from api.utils.file_utils import get_project_base_directory
-
-LOG_LEVEL = logging.INFO
-LOG_FILE = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"ragflow_{os.getpid()}.log"))
-LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"
-logger = None
-
-def getLogger():
-    global logger
-    if logger is not None:
-        return logger
-
-    print(f"log file path: {LOG_FILE}")
-    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
-    logger = logging.getLogger("ragflow")
-    logger.setLevel(LOG_LEVEL)
-
-    handler1 = RotatingFileHandler(LOG_FILE, maxBytes=10*1024*1024, backupCount=5)
-    handler1.setLevel(LOG_LEVEL)
-    formatter1 = logging.Formatter(LOG_FORMAT)
-    handler1.setFormatter(formatter1)
+def get_project_base_directory():
+    PROJECT_BASE = os.path.abspath(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            os.pardir,
+            os.pardir,
+        )
+    )
+    return PROJECT_BASE
+
+def initRootLogger(script_path: str, log_level: int = logging.INFO, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
+    logger = logging.getLogger()
+    if logger.hasHandlers():
+        return
+
+    script_name = os.path.basename(script_path)
+    log_path = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{os.path.splitext(script_name)[0]}.log"))
+
+    os.makedirs(os.path.dirname(log_path), exist_ok=True)
+    logger.setLevel(log_level)
+    formatter = logging.Formatter(log_format)
+
+    handler1 = RotatingFileHandler(log_path, maxBytes=10*1024*1024, backupCount=5)
+    handler1.setLevel(log_level)
+    handler1.setFormatter(formatter)
     logger.addHandler(handler1)
 
     handler2 = logging.StreamHandler()
-    handler2.setLevel(LOG_LEVEL)
-    formatter2 = logging.Formatter(LOG_FORMAT)
-    handler2.setFormatter(formatter2)
+    handler2.setLevel(log_level)
+    handler2.setFormatter(formatter)
     logger.addHandler(handler2)
 
-    return logger
-
-logger = getLogger()
+    msg = f"{script_name} log path: {log_path}"
+    logger.info(msg)
\ No newline at end of file
diff --git a/api/validation.py b/api/validation.py
index f4d610c45f747c3b54df54cd6f155ca1fbb7e3a4..7c0e42a14167eebfae46555dfa919ccf850f9e2d 100644
--- a/api/validation.py
+++ b/api/validation.py
@@ -14,20 +14,20 @@
 # limitations under the License.
 #
+import logging
 import sys
-from api.utils.log_utils import logger
 
 
 def python_version_validation():
     # Check python version
     required_python_version = (3, 10)
     if sys.version_info < required_python_version:
-        logger.info(
+        logging.info(
             f"Required Python: >= {required_python_version[0]}.{required_python_version[1]}. Current Python version: {sys.version_info[0]}.{sys.version_info[1]}."
         )
         sys.exit(1)
     else:
-        logger.info(f"Python version: {sys.version_info[0]}.{sys.version_info[1]}")
+        logging.info(f"Python version: {sys.version_info[0]}.{sys.version_info[1]}")
 
 
 python_version_validation()
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 0d4572bbde9a9b778b8f9d71f5ec11236663dce4..2f2bde76ba9fabdaacc77b7c634c2759f6aefbb4 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 
+import logging
 import os
 import random
 
@@ -18,7 +19,6 @@ import xgboost as xgb
 from io import BytesIO
 import re
 import pdfplumber
-import logging
 from PIL import Image
 import numpy as np
 from timeit import default_timer as timer
@@ -26,15 +26,11 @@ from pypdf import PdfReader as pdf2_read
 
 from api.settings import LIGHTEN
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from copy import deepcopy
 from huggingface_hub import snapshot_download
 
-logging.getLogger("pdfminer").setLevel(logging.WARNING)
-
-
 class RAGFlowPdfParser:
     def __init__(self):
         self.ocr = OCR()
@@ -51,7 +47,7 @@ class RAGFlowPdfParser:
             if torch.cuda.is_available():
                 self.updown_cnt_mdl.set_param({"device": "cuda"})
         except Exception:
-            logger.exception("RAGFlowPdfParser __init__")
+            logging.exception("RAGFlowPdfParser __init__")
         try:
             model_dir = os.path.join(
                 get_project_base_directory(),
@@ -188,7 +184,7 @@ class RAGFlowPdfParser:
         return True
 
     def _table_transformer_job(self, ZM):
-        logger.info("Table processing...")
+        logging.debug("Table processing...")
         imgs, pos = [], []
         tbcnt = [0]
         MARGIN = 10
@@ -426,7 +422,7 @@ class RAGFlowPdfParser:
             detach_feats = [b["x1"] < b_["x0"],
                             b["x0"] > b_["x1"]]
             if (any(feats) and not any(concatting_feats)) or any(detach_feats):
-                logger.info("{} {} {} {}".format(
+                logging.debug("{} {} {} {}".format(
                     b["text"],
                     b_["text"],
                     any(feats),
@@ -727,14 +723,14 @@ class RAGFlowPdfParser:
             #     continue
             if tv < fv and tk:
                 tables[tk].insert(0, c)
-                logger.debug(
+                logging.debug(
                     "TABLE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
                     tk)
             elif fk:
                 figures[fk].insert(0, c)
-                logger.debug(
+                logging.debug(
                     "FIGURE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
@@ -761,7 +757,7 @@ class RAGFlowPdfParser:
                 if ii is not None:
                     b = louts[ii]
                 else:
-                    logger.warn(
+                    logging.warn(
                         f"Missing layout match: {pn + 1},%s" %
                         (bxs[0].get(
                             "layoutno", "")))
@@ -919,7 +915,7 @@ class RAGFlowPdfParser:
                 if usefull(boxes[0]):
                     dfs(boxes[0], 0)
                 else:
-                    logger.debug("WASTE: " + boxes[0]["text"])
+                    logging.debug("WASTE: " + boxes[0]["text"])
             except Exception:
                 pass
             boxes.pop(0)
@@ -928,7 +924,7 @@ class RAGFlowPdfParser:
                 res.append(
                     "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
             else:
-                logger.debug("REMOVED: " +
+                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))
 
         return "\n\n".join(res)
@@ -940,7 +936,7 @@ class RAGFlowPdfParser:
                 fnm) if not binary else pdfplumber.open(BytesIO(binary))
             return len(pdf.pages)
         except Exception:
-            logger.exception("total_page_number")
+            logging.exception("total_page_number")
 
     def __images__(self, fnm, zoomin=3, page_from=0,
                    page_to=299, callback=None):
@@ -964,7 +960,7 @@ class RAGFlowPdfParser:
                                 self.pdf.pages[page_from:page_to]]
             self.total_page = len(self.pdf.pages)
         except Exception:
-            logger.exception("RAGFlowPdfParser __images__")
+            logging.exception("RAGFlowPdfParser __images__")
 
         self.outlines = []
         try:
@@ -980,11 +976,11 @@ class RAGFlowPdfParser:
 
             dfs(outlines, 0)
         except Exception as e:
-            logger.warning(f"Outlines exception: {e}")
+            logging.warning(f"Outlines exception: {e}")
         if not self.outlines:
-            logger.warning("Miss outlines")
+            logging.warning("Miss outlines")
 
-        logger.info("Images converted.")
+        logging.debug("Images converted.")
         self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
             random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
             for i in range(len(self.page_chars))]
@@ -1024,7 +1020,7 @@ class RAGFlowPdfParser:
             self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                         "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
-        logger.info("Is it English:", self.is_english)
+        logging.debug("Is it English:", self.is_english)
 
         self.page_cum_height = np.cumsum(self.page_cum_height)
         assert len(self.page_cum_height) == len(self.page_images) + 1
@@ -1164,9 +1160,9 @@ class PlainParser(object):
 
             dfs(outlines, 0)
         except Exception:
-            logger.exception("Outlines exception")
+            logging.exception("Outlines exception")
         if not self.outlines:
-            logger.warning("Miss outlines")
+            logging.warning("Miss outlines")
 
         return [(l, "") for l in lines], []
diff --git a/deepdoc/parser/resume/entities/corporations.py b/deepdoc/parser/resume/entities/corporations.py
index 20f606e17ed236a09815f8ee38765adad6c547e0..c26f58aebd478e4906133b90e925aa799c1143fa 100644
--- a/deepdoc/parser/resume/entities/corporations.py
+++ b/deepdoc/parser/resume/entities/corporations.py
@@ -11,13 +11,13 @@
 # limitations under the License.
 #
+import logging
 import re
 import json
 import os
 import pandas as pd
 from rag.nlp import rag_tokenizer
 from . import regions
-from api.utils.log_utils import logger
 
 
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -71,7 +71,7 @@ GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
 for c,v in CORP_TAG.items():
     cc = corpNorm(rmNoise(c), False)
     if not cc:
-        logger.info(c)
+        logging.debug(c)
 CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
 
 def is_good(nm):
diff --git a/deepdoc/parser/resume/step_two.py b/deepdoc/parser/resume/step_two.py
index d480b4520aa242d7995c4a66e44ca06634c29da7..afc5fb47da718cf3372252cc7f9762c32913d9b9 100644
--- a/deepdoc/parser/resume/step_two.py
+++ b/deepdoc/parser/resume/step_two.py
@@ -10,7 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import logging
 import re
 import copy
 import time
@@ -23,7 +23,6 @@ from deepdoc.parser.resume.entities import degrees, schools, corporations
 from rag.nlp import rag_tokenizer, surname
 from xpinyin import Pinyin
 from contextlib import contextmanager
-from api.utils.log_utils import logger
 
 
 class TimeoutException(Exception):
     pass
@@ -164,7 +163,7 @@ def forEdu(cv):
                 y, m, d = getYMD(edu_end_dt)
                 cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
             except Exception as e:
-                logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
+                logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
         if sch:
             cv["school_name_kwd"] = sch
             if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@@ -276,7 +275,7 @@ def forWork(cv):
             try:
                 duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
             except Exception:
                logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
 
         if n.get("scale"):
             r = re.search(r"^([0-9]+)", str(n["scale"]))
@@ -333,7 +332,7 @@ def forWork(cv):
             y, m, d = getYMD(work_st_tm)
             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
         except Exception as e:
-            logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
+            logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
 
     cv["job_num_int"] = 0
     if duas:
@@ -464,7 +463,7 @@ def parse(cv):
             cv[f"{t}_kwd"] = nms
             cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
         except Exception:
-            logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
+            logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
             cv[k] = []
 
     # tokenize fields
@@ -565,7 +564,7 @@ def parse(cv):
             cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
             cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
         except Exception as e:
-            logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
+            logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
     if "work_exp_flt" not in cv and cv.get("work_experience", 0):
         cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
 
     keys = list(cv.keys())
@@ -580,7 +579,7 @@ def parse(cv):
         cv["tob_resume_id"] = str(cv["tob_resume_id"])
     cv["id"] = cv["tob_resume_id"]
-    logger.info("CCCCCCCCCCCCCCC")
+    logging.debug("CCCCCCCCCCCCCCC")
 
     return dealWithInt64(cv)
diff --git a/deepdoc/vision/operators.py b/deepdoc/vision/operators.py
index ff625906dbda0a4a351d292db0cbcb2c24c71339..9037fc455238cb5b1d4c90c56b37b6cc202efad7 100644
--- a/deepdoc/vision/operators.py
+++ b/deepdoc/vision/operators.py
@@ -14,13 +14,13 @@
 # limitations under the License.
 #
+import logging
 import sys
 import six
 import cv2
 import numpy as np
 import math
 from PIL import Image
-from api.utils.log_utils import logger
 
 
 class DecodeImage(object):
@@ -403,7 +403,7 @@ class DetResizeForTest(object):
                 return None, (None, None)
             img = cv2.resize(img, (int(resize_w), int(resize_h)))
         except BaseException:
-            logger.exception("{} {} {}".format(img.shape, resize_w, resize_h))
+            logging.exception("{} {} {}".format(img.shape, resize_w, resize_h))
             sys.exit(0)
         ratio_h = resize_h / float(h)
         ratio_w = resize_w / float(w)
diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py
index e4d2107cf477273b880a74036f76c9c99d8ddf95..2181550e548fe1275f3a3570a86365216889a9c0 100644
--- a/deepdoc/vision/recognizer.py
+++ b/deepdoc/vision/recognizer.py
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
+import logging
 import os
 from copy import deepcopy
 
@@ -19,7 +20,6 @@ from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from .operators import *
-from api.utils.log_utils import logger
 
 
 class Recognizer(object):
@@ -440,7 +440,7 @@ class Recognizer(object):
             end_index = min((i + 1) * batch_size, len(imgs))
             batch_image_list = imgs[start_index:end_index]
             inputs = self.preprocess(batch_image_list)
-            logger.info("preprocess")
+            logging.debug("preprocess")
             for ins in inputs:
                 bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
                 res.append(bb)
diff --git a/deepdoc/vision/seeit.py b/deepdoc/vision/seeit.py
index e753598496018ff64741d6a6444aac55ab8a6c86..6064bc4843e023d71b8266e2b137b78b1c1b7f15 100644
--- a/deepdoc/vision/seeit.py
+++ b/deepdoc/vision/seeit.py
@@ -11,10 +11,10 @@
 # limitations under the License.
 #
+import logging
 import os
 import PIL
 from PIL import ImageDraw
-from api.utils.log_utils import logger
 
 
 def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
@@ -25,7 +25,7 @@ def save_results(image_list, results, labels, output_dir='output/', threshold=0.
 
         out_path = os.path.join(output_dir, f"{idx}.jpg")
         im.save(out_path, quality=95)
-        logger.info("save result to: " + out_path)
+        logging.debug("save result to: " + out_path)
 
 
 def draw_box(im, result, lables, threshold=0.5):
diff --git a/deepdoc/vision/t_recognizer.py b/deepdoc/vision/t_recognizer.py
index 7a6f2e472592bc15de26bb933d3c73a63bea4f92..e5d7a22ae7ea2fb8c4f220cffe356424b452742e 100644
--- a/deepdoc/vision/t_recognizer.py
+++ b/deepdoc/vision/t_recognizer.py
@@ -10,9 +10,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import os
 import sys
-from api.utils.log_utils import logger
 
 sys.path.insert(
     0,
@@ -59,7 +59,7 @@ def main(args):
             } for t in lyt]
         img = draw_box(images[i], lyt, labels, float(args.threshold))
         img.save(outputs[i], quality=95)
-        logger.info("save result to: " + outputs[i])
+        logging.info("save result to: " + outputs[i])
 
 
 def get_table_html(img, tb_cpns, ocr):
diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py
index 7ec6b1a62dd70fc6b46e58ca2c206802a7b5625a..5759c0f644bd190a26e6de66600ce0ca19095e82 100644
--- a/deepdoc/vision/table_structure_recognizer.py
+++ b/deepdoc/vision/table_structure_recognizer.py
@@ -38,7 +38,7 @@ class TableStructureRecognizer(Recognizer):
             super().__init__(self.labels, "tsr", os.path.join(
                 get_project_base_directory(),
                 "rag/res/deepdoc"))
-        except Exception as e:
+        except Exception:
             super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc",
                                               local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                                               local_dir_use_symlinks=False))
diff --git a/graphrag/claim_extractor.py b/graphrag/claim_extractor.py
index edb59e878812265b6bfbed1d3450cbb202e667c0..d3495a79195a0effa4148a6074e0c9f36b1d9aa9 100644
--- a/graphrag/claim_extractor.py
+++ b/graphrag/claim_extractor.py
@@ -5,6 +5,7 @@ Reference:
  - [graphrag](https://github.com/microsoft/graphrag)
 """
 
+import logging
 import argparse
 import json
 import re
@@ -17,7 +18,6 @@ import tiktoken
 from graphrag.claim_prompt import CLAIM_EXTRACTION_PROMPT, CONTINUE_PROMPT, LOOP_PROMPT
 from rag.llm.chat_model import Base as CompletionLLM
 from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
-from api.utils.log_utils import logger
 
 DEFAULT_TUPLE_DELIMITER = "<|>"
 DEFAULT_RECORD_DELIMITER = "##"
@@ -126,7 +126,7 @@ class ClaimExtractor:
                 ]
                 source_doc_map[document_id] = text
             except Exception as e:
-                logger.exception("error extracting claim")
+                logging.exception("error extracting claim")
                 self._on_error(
                     e,
                     traceback.format_exc(),
@@ -265,4 +265,4 @@ if __name__ == "__main__":
         "claim_description": ""
     }
     claim = ex(info)
-    logger.info(json.dumps(claim.output, ensure_ascii=False, indent=2))
+    logging.info(json.dumps(claim.output, ensure_ascii=False, indent=2))
diff --git a/graphrag/community_reports_extractor.py b/graphrag/community_reports_extractor.py
index 468ee18c38a274d73769dca978ce039bc8c07b55..213162f29be902e1144003db07c2d3085c16b6a8 100644
--- a/graphrag/community_reports_extractor.py
+++ b/graphrag/community_reports_extractor.py
@@ -5,6 +5,7 @@ Reference:
  - [graphrag](https://github.com/microsoft/graphrag)
 """
 
+import logging
 import json
 import re
 import traceback
@@ -19,7 +20,6 @@ from rag.llm.chat_model import Base as CompletionLLM
 from graphrag.utils import ErrorHandlerFn, perform_variable_replacements, dict_has_keys_with_types
 from rag.utils import num_tokens_from_string
 from timeit import default_timer as timer
-from api.utils.log_utils import logger
 
 
 @dataclass
@@ -80,7 +80,7 @@ class CommunityReportsExtractor:
                 response = re.sub(r"[^\}]*$", "", response)
                 response = re.sub(r"\{\{", "{", response)
                 response = re.sub(r"\}\}", "}", response)
-                logger.info(response)
+                logging.debug(response)
                 response = json.loads(response)
                 if not dict_has_keys_with_types(response, [
                             ("title", str),
@@ -92,7 +92,7 @@ class CommunityReportsExtractor:
                 response["weight"] = weight
                 response["entities"] = ents
             except Exception as e:
-                logger.exception("CommunityReportsExtractor got exception")
+                logging.exception("CommunityReportsExtractor got exception")
                 self._on_error(e, traceback.format_exc(), None)
                 continue
diff --git a/graphrag/description_summary.py b/graphrag/description_summary.py
index 3aa3287e7e563b5d1b972af4b40fa14540289e7e..f5537c954476b47859c71eb2e95cc86db5c5e6b4 100644
--- a/graphrag/description_summary.py
+++ b/graphrag/description_summary.py
@@ -5,19 +5,11 @@ Reference:
  - [graphrag](https://github.com/microsoft/graphrag)
 """
 
-import argparse
-import html
 import json
-import logging
-import numbers
-import re
-import traceback
-from collections.abc import Callable
 from dataclasses import dataclass
 
 from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
 from rag.llm.chat_model import Base as CompletionLLM
-import networkx as nx
 from rag.utils import num_tokens_from_string
diff --git a/graphrag/entity_resolution.py b/graphrag/entity_resolution.py
index 3fd7f4fbf06a8e5c4f47c9e7dca4d0ae6a96efcc..1c8a4b4e0fe7e5e24f287265feb534afd5de87e0 100644
--- a/graphrag/entity_resolution.py
+++ b/graphrag/entity_resolution.py
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import itertools
 import logging
+import itertools
 import re
 import traceback
 from dataclasses import dataclass
diff --git a/graphrag/index.py b/graphrag/index.py
index 4b72333ccf03029edc224532ee62de88988636d7..26891803f1a83d3445678b3bb56ef19ae381da5f 100644
--- a/graphrag/index.py
+++ b/graphrag/index.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import os
 from concurrent.futures import ThreadPoolExecutor
 import json
@@ -28,7 +29,6 @@ from graphrag.graph_extractor import GraphExtractor, DEFAULT_ENTITY_TYPES
 from graphrag.mind_map_extractor import MindMapExtractor
 from rag.nlp import rag_tokenizer
 from rag.utils import num_tokens_from_string
-from api.utils.log_utils import logger
 
 
 def graph_merge(g1, g2):
@@ -95,7 +95,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
     chunks = []
     for n, attr in graph.nodes(data=True):
         if attr.get("rank", 0) == 0:
-            logger.info(f"Ignore entity: {n}")
+            logging.debug(f"Ignore entity: {n}")
             continue
         chunk = {
             "name_kwd": n,
@@ -137,7 +137,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
     mg = mindmap(_chunks).output
     if not len(mg.keys()):
         return chunks
-    logger.info(json.dumps(mg, ensure_ascii=False, indent=2))
+    logging.debug(json.dumps(mg, ensure_ascii=False, indent=2))
     chunks.append(
         {
             "content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
diff --git a/graphrag/leiden.py b/graphrag/leiden.py
index 07cd815875d8fdbd6d40b4237c8e6011a39fed94..09440597ea3af381ab8df81f7a6f4e5126ed333c 100644
--- a/graphrag/leiden.py
+++ b/graphrag/leiden.py
@@ -14,8 +14,6 @@ from graspologic.utils import largest_connected_component
 import networkx as nx
 from networkx import is_empty
 
-log = logging.getLogger(__name__)
-
 
 def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
     """Ensure an undirected graph with the same relationships will always be read the same way."""
@@ -99,7 +97,7 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
     max_cluster_size = args.get("max_cluster_size", 12)
     use_lcc = args.get("use_lcc", True)
     if args.get("verbose", False):
-        log.info(
+        logging.debug(
             "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
         )
     if not graph.nodes():
         return {}
diff --git a/graphrag/mind_map_extractor.py b/graphrag/mind_map_extractor.py
index 8ceebe4e2268df9495f9e2a96393f5d5870a38d8..1f4b924af10c82d8d53786a2576e6ab4b628fda6 100644
--- a/graphrag/mind_map_extractor.py
+++ b/graphrag/mind_map_extractor.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 #
 
-import collections
 import logging
+import collections
 import os
 import re
 import traceback
@@ -29,7 +29,6 @@ from rag.llm.chat_model import Base as CompletionLLM
 import markdown_to_json
 from functools import reduce
 from rag.utils import num_tokens_from_string
-from api.utils.log_utils import logger
 
 
 @dataclass
@@ -193,6 +192,6 @@ class MindMapExtractor:
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [{"role": "user", "content": "Output:"}], gen_conf)
         response = re.sub(r"```[^\n]*", "", response)
-        logger.info(response)
-        logger.info(self._todict(markdown_to_json.dictify(response)))
+        logging.debug(response)
+        logging.debug(self._todict(markdown_to_json.dictify(response)))
         return self._todict(markdown_to_json.dictify(response))
diff --git a/intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py b/intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py
index 7be661f8a2363ec776c59c84358c3fe71fda4a37..17878c540bf07c0da924a7e91955d68c1c687dca 100644
--- a/intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py
+++ b/intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py
@@ -1,8 +1,8 @@
+import logging
 import requests
 from bridge.context import ContextType  # Import Context, ContextType
 from bridge.reply import Reply, ReplyType  # Import Reply, ReplyType
 from bridge import *
-from api.utils.log_utils import logger
 from plugins import Plugin, register  # Import Plugin and register
 from plugins.event import Event, EventContext, EventAction  # Import event-related classes
 
@@ -16,7 +16,7 @@ class RAGFlowChat(Plugin):
         self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
         # Store conversation_id for each user
         self.conversations = {}
-        logger.info("[RAGFlowChat] Plugin initialized")
+        logging.info("[RAGFlowChat] Plugin initialized")
 
     def on_handle_context(self, e_context: EventContext):
         context = e_context['context']
@@ -45,7 +45,7 @@ class RAGFlowChat(Plugin):
         user_id = session_id  # Use session_id as user_id
 
         if not api_key or not host_address:
-            logger.error("[RAGFlowChat] Missing configuration")
+            logging.error("[RAGFlowChat] Missing configuration")
             return "The plugin configuration is incomplete. Please check the configuration."
 
         headers = {
@@ -63,20 +63,20 @@ class RAGFlowChat(Plugin):
         }
         try:
             response = requests.get(url_new_conversation, headers=headers, params=params_new_conversation)
-            logger.debug(f"[RAGFlowChat] New conversation response: {response.text}")
+            logging.debug(f"[RAGFlowChat] New conversation response: {response.text}")
             if response.status_code == 200:
                 data = response.json()
                 if data.get("code") == 0:
                     conversation_id = data["data"]["id"]
                     self.conversations[user_id] = conversation_id
                 else:
-                    logger.error(f"[RAGFlowChat] Failed to create conversation: {data.get('message')}")
+                    logging.error(f"[RAGFlowChat] Failed to create conversation: {data.get('message')}")
                     return f"Sorry, unable to create a conversation: {data.get('message')}"
             else:
-                logger.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
+                logging.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
                 return f"Sorry, unable to connect to RAGFlow API (create conversation).
HTTP status code: {response.status_code}" except Exception as e: - logger.exception("[RAGFlowChat] Exception when creating conversation") + logging.exception("[RAGFlowChat] Exception when creating conversation") return f"Sorry, an internal error occurred: {str(e)}" # Step 2: Send the message and get a reply @@ -95,18 +95,18 @@ class RAGFlowChat(Plugin): try: response = requests.post(url_completion, headers=headers, json=payload_completion) - logger.debug(f"[RAGFlowChat] Completion response: {response.text}") + logging.debug(f"[RAGFlowChat] Completion response: {response.text}") if response.status_code == 200: data = response.json() if data.get("code") == 0: answer = data["data"]["answer"] return answer else: - logger.error(f"[RAGFlowChat] Failed to get answer: {data.get('message')}") + logging.error(f"[RAGFlowChat] Failed to get answer: {data.get('message')}") return f"Sorry, unable to get a reply: {data.get('message')}" else: - logger.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}") + logging.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}") return f"Sorry, unable to connect to RAGFlow API (get reply). HTTP status code: {response.status_code}" except Exception as e: - logger.exception("[RAGFlowChat] Exception when getting answer") + logging.exception("[RAGFlowChat] Exception when getting answer") return f"Sorry, an internal error occurred: {str(e)}" diff --git a/rag/app/book.py b/rag/app/book.py index 9aa88c3dfac0f0bb694ba70c80f76d033143dd5e..efd78c18e54cd36881b0acd3d4f1555fab56e351 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging from tika import parser import re from io import BytesIO @@ -20,7 +21,6 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \ tokenize_chunks from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser -from api.utils.log_utils import logger class Pdf(PdfParser): @@ -39,7 +39,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.67, "Layout analysis finished") - logger.info("layouts: {}".format(timer() - start)) + logging.debug("layouts: {}".format(timer() - start)) self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() diff --git a/rag/app/email.py b/rag/app/email.py index 1c3350222bfa2b780c855699ba57bc6edc677e68..5226c78eead8fa86a3bfcbd715ffd9c1ec822ec1 100644 --- a/rag/app/email.py +++ b/rag/app/email.py @@ -11,6 +11,7 @@ # limitations under the License. 
# +import logging from email import policy from email.parser import BytesParser from rag.app.naive import chunk as naive_chunk @@ -18,7 +19,6 @@ import re from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks from deepdoc.parser import HtmlParser, TxtParser from timeit import default_timer as timer -from api.utils.log_utils import logger import io @@ -86,7 +86,7 @@ def chunk( ) main_res.extend(tokenize_chunks(chunks, doc, eng, None)) - logger.info("naive_merge({}): {}".format(filename, timer() - st)) + logging.debug("naive_merge({}): {}".format(filename, timer() - st)) # get the attachment info for part in msg.iter_attachments(): content_disposition = part.get("Content-Disposition") diff --git a/rag/app/laws.py b/rag/app/laws.py index 38e7d106e701f9390a6e038b6833cf6876ec72ba..3334e9eba8551e47f62fef3f38a4bf39d365ebfa 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -21,7 +21,6 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, make_colon_as_title, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser -from api.utils.log_utils import logger class Docx(DocxParser): @@ -122,7 +121,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.67, "Layout analysis finished") - logger.info("layouts:".format( + logging.debug("layouts: {}".format(timer() - start )) self._naive_vertical_merge() diff --git a/rag/app/manual.py b/rag/app/manual.py index a3cf100e71ce054b81a894873ab9572ed29bafff..4efb92986e99033c06a8728d9a8cbe68be491cc8 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -14,6 +14,7 @@ # limitations under the License. # +import logging import copy import re @@ -24,7 +25,6 @@ from rag.utils import num_tokens_from_string from deepdoc.parser import PdfParser, PlainParser, DocxParser from docx import Document from PIL import Image -from api.utils.log_utils import logger class Pdf(PdfParser): @@ -48,11 +48,11 @@ class Pdf(PdfParser): # for bb in self.boxes: # for b in bb: # print(b) - logger.info("OCR: {}".format(timer() - start)) + logging.debug("OCR: {}".format(timer() - start)) self._layouts_rec(zoomin) callback(0.65, "Layout analysis finished.") - logger.info("layouts: {}".format(timer() - start)) + logging.debug("layouts: {}".format(timer() - start)) self._table_transformer_job(zoomin) callback(0.67, "Table analysis finished.") self._text_merge() diff --git a/rag/app/naive.py b/rag/app/naive.py index 3834c67b106ae6899b107b2226191b39f09083e1..dfaef0bec88a4f84a11cc1b8551df85cf4e5066a 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging from tika import parser from io import BytesIO from docx import Document @@ -19,7 +20,6 @@ from deepdoc.parser.pdf_parser import PlainParser from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \ naive_merge_docx, tokenize_chunks_docx from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser -from api.utils.log_utils import logger from rag.utils import num_tokens_from_string from PIL import Image from functools import reduce @@ -41,13 +41,13 @@ class Docx(DocxParser): try: image_blob = related_part.image.blob except UnrecognizedImageError: - logger.info("Unrecognized image format. Skipping image.") + logging.info("Unrecognized image format. 
Skipping image.") return None except UnexpectedEndOfFileError: - logger.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.") + logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.") return None except InvalidImageStreamError: - logger.info("The recognized image stream appears to be corrupted. Skipping image.") + logging.info("The recognized image stream appears to be corrupted. Skipping image.") return None try: image = Image.open(BytesIO(image_blob)).convert('RGB') @@ -133,7 +133,7 @@ class Pdf(PdfParser): callback ) callback(msg="OCR finished") - logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) @@ -147,7 +147,7 @@ class Pdf(PdfParser): self._concat_downward() # self._filter_forpages() - logger.info("layouts cost: {}s".format(timer() - start)) + logging.info("layouts cost: {}s".format(timer() - start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls @@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return chunks res.extend(tokenize_chunks_docx(chunks, doc, eng, images)) - logger.info("naive_merge({}): {}".format(filename, timer() - st)) + logging.info("naive_merge({}): {}".format(filename, timer() - st)) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): @@ -280,7 +280,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return chunks res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) - logger.info("naive_merge({}): {}".format(filename, timer() - st)) + logging.info("naive_merge({}): {}".format(filename, timer() - st)) return res diff --git a/rag/app/one.py b/rag/app/one.py index 7335c74fa73ac21063d38510d270994fa6afa0f1..9f24ccb95bba7c0fcaff3a21f65bac39639962df 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging from tika import parser from io import BytesIO import re @@ -18,7 +19,6 @@ from deepdoc.parser.utils import get_text from rag.app import laws from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser -from api.utils.log_utils import logger class Pdf(PdfParser): @@ -38,7 +38,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin, drop=False) callback(0.63, "Layout analysis finished.") - logger.info("layouts cost: {}s".format(timer() - start)) + logging.debug("layouts cost: {}s".format(timer() - start)) self._table_transformer_job(zoomin) callback(0.65, "Table analysis finished.") self._text_merge() diff --git a/rag/app/paper.py b/rag/app/paper.py index 59c3ccd75602e0f6852f8c6a69a022522e962495..1be93be8656c3c39b75004aae710d68119a2db4e 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import logging import copy import re @@ -17,7 +18,6 @@ from api.db import ParserType from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from deepdoc.parser import PdfParser, PlainParser import numpy as np -from api.utils.log_utils import logger class Pdf(PdfParser): @@ -41,7 +41,7 @@ class Pdf(PdfParser): start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis finished") - logger.info(f"layouts cost: {timer() - start}s") + logging.debug(f"layouts cost: {timer() - start}s") self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() @@ -53,7 +53,7 @@ class Pdf(PdfParser): # clean mess if column_width < self.page_images[0].size[0] / zoomin / 2: - logger.info("two_column................... {} {}".format(column_width, + logging.debug("two_column................... {} {}".format(column_width, self.page_images[0].size[0] / zoomin / 2)) self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) for b in self.boxes: @@ -115,8 +115,8 @@ class Pdf(PdfParser): from_page, min( to_page, self.total_page))) for b in self.boxes: - logger.info("{} {}".format(b["text"], b.get("layoutno"))) - logger.info("{}".format(tbls)) + logging.debug("{} {}".format(b["text"], b.get("layoutno"))) + logging.debug("{}".format(tbls)) return { "title": title, @@ -157,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"]) # is it English eng = lang.lower() == "english" # pdf_parser.is_english - logger.info("It's English.....{}".format(eng)) + logging.debug("It's English.....{}".format(eng)) res = tokenize_table(paper["tables"], doc, eng) @@ -184,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1 sec_ids.append(sid) - logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid)) + logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid)) chunks = [] last_sid = -2 diff --git a/rag/app/qa.py b/rag/app/qa.py index b6c756966cba7c8656b8df5aba123ff6b9f76eca..d9756eecd056dd051fe5023ac1f0999982e78d8e 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import logging import re from copy import deepcopy from io import BytesIO @@ -19,7 +20,6 @@ from openpyxl import load_workbook from deepdoc.parser.utils import get_text from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level from rag.nlp import rag_tokenizer, tokenize_table, concat_img -from api.utils.log_utils import logger from deepdoc.parser import PdfParser, ExcelParser, DocxParser from docx import Document from PIL import Image @@ -82,7 +82,7 @@ class Pdf(PdfParser): callback ) callback(msg="OCR finished") - logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin, drop=False) callback(0.63, "Layout analysis finished.") @@ -94,7 +94,7 @@ class Pdf(PdfParser): #self._naive_vertical_merge() # self._concat_downward() #self._filter_forpages() - logger.info("layouts: {}".format(timer() - start)) + logging.debug("layouts: {}".format(timer() - start)) sections = [b["text"] for b in self.boxes] bull_x0_list = [] q_bull, reg = qbullets_category(sections) diff --git a/rag/app/resume.py b/rag/app/resume.py index ade0b12c93f205eabb40d1eeb90d2019ac52f0d0..0805253750cbb7191d2b5ab7e79d89dfe5c38396 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging import base64 import datetime import json @@ -20,7 +21,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from rag.nlp import rag_tokenizer from deepdoc.parser.resume import refactor from deepdoc.parser.resume import step_one, step_two -from api.utils.log_utils import logger from rag.utils import rmSpace forbidden_select_fields4resume = [ @@ -64,7 +64,7 @@ def remote_call(filename, binary): resume = step_two.parse(resume) return resume except Exception: - logger.exception("Resume parser error") + logging.exception("Resume parser error") return {} @@ -86,7 +86,7 @@ def chunk(filename, binary=None, callback=None, **kwargs): callback(-1, "Resume is not successfully parsed.") raise Exception("Resume parser remote call fail!") callback(0.6, "Done parsing. Chunking...") - logger.info("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2)) + logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2)) field_map = { "name_kwd": "姓名/名字", @@ -158,7 +158,7 @@ def chunk(filename, binary=None, callback=None, **kwargs): resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) doc[n] = resume[n] - logger.info("chunked resume to " + str(doc)) + logging.debug("chunked resume to " + str(doc)) KnowledgebaseService.update_parser_config( kwargs["kb_id"], {"field_map": field_map}) return [doc] diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py index b9e1fec8e142088f748883751beb7331639dea4d..b167b75e97578846e7eace3fd3c099631eb9d364 100644 --- a/rag/llm/embedding_model.py +++ b/rag/llm/embedding_model.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import logging import re from typing import Optional import threading @@ -32,7 +33,6 @@ from api.utils.file_utils import get_home_cache_dir from rag.utils import num_tokens_from_string, truncate import google.generativeai as genai import json -from api.utils.log_utils import logger class Base(ABC): def __init__(self, key, model_name): @@ -297,7 +297,7 @@ class YoudaoEmbed(Base): if not LIGHTEN and not YoudaoEmbed._client: from BCEmbedding import EmbeddingModel as qanthing try: - logger.info("LOADING BCE...") + logging.info("LOADING BCE...") YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join( get_home_cache_dir(), "bce-embedding-base_v1")) diff --git a/rag/llm/rerank_model.py b/rag/llm/rerank_model.py index a2c3902d67fe12ce7368a0ad2f887e0d40058035..45bd10aeb3fa139e08f96646363db786279000cb 100644 --- a/rag/llm/rerank_model.py +++ b/rag/llm/rerank_model.py @@ -27,7 +27,7 @@ from api.settings import LIGHTEN from api.utils.file_utils import get_home_cache_dir from rag.utils import num_tokens_from_string, truncate import json -from api.utils.log_utils import logger +import logging def sigmoid(x): @@ -127,7 +126,7 @@ class YoudaoRerank(DefaultRerank): with YoudaoRerank._model_lock: if not YoudaoRerank._model: try: - logger.info("LOADING BCE...") + logging.info("LOADING BCE...") YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join( get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name))) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 8fd239736847717c7838ece603b044d30fec359a..ce1ec1517a0329d3976fbbdbd5761e65e72d4c22 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -14,6 +14,7 @@ # limitations under the License. # +import logging import random from collections import Counter @@ -26,7 +27,6 @@ from word2number import w2n from cn2an import cn2an from PIL import Image import json -from api.utils.log_utils import logger all_codecs = [ 'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs', @@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None): # wrap up as es documents for ck in chunks: if len(ck.strip()) == 0:continue - logger.debug("-- {}".format(ck)) + logging.debug("-- {}".format(ck)) d = copy.deepcopy(doc) if pdf_parser: try: @@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images): # wrap up as es documents for ck, image in zip(chunks, images): if len(ck.strip()) == 0:continue - logger.debug("-- {}".format(ck)) + logging.debug("-- {}".format(ck)) d = copy.deepcopy(doc) d["image"] = image tokenize(d, ck, eng) @@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth): for i in range(len(cks)): cks[i] = [sections[j] for j in cks[i][::-1]] - logger.info("\n* ".join(cks[i])) + logging.debug("\n* ".join(cks[i])) res = [[]] num = [0] diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 390b514e86bac4dda26d03c426f6a40272e8f932..f77f65fd41add27d40769432fdaea427b687e260 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -14,9 +14,9 @@ # limitations under the License. # +import logging import json import re -import logging from rag.utils.doc_store_conn import MatchTextExpr from rag.nlp import rag_tokenizer, term_weight, synonym @@ -88,7 +88,7 @@ class FulltextQueryer: syn = ["\"{}\"^{:.4f}".format(s, w / 4.) 
for s in syn] syns.append(" ".join(syn)) - q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)] + q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)] for i in range(1, len(tks_w)): q.append( '"%s %s"^%.4f' @@ -121,7 +121,7 @@ class FulltextQueryer: twts = self.tw.weights([tt]) syns = self.syn.lookup(tt) if syns: keywords.extend(syns) - logging.info(json.dumps(twts, ensure_ascii=False)) + logging.debug(json.dumps(twts, ensure_ascii=False)) tms = [] for tk, w in sorted(twts, key=lambda x: x[1] * -1): sm = ( diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py index cd71fa64c61286426d44a43cb97c20bd47356acd..378b678499d9490754f0ff0e5792ea4e89fef881 100644 --- a/rag/nlp/rag_tokenizer.py +++ b/rag/nlp/rag_tokenizer.py @@ -14,6 +14,7 @@ # limitations under the License. # +import logging import copy import datrie import math @@ -25,7 +26,6 @@ from hanziconv import HanziConv from nltk import word_tokenize from nltk.stem import PorterStemmer, WordNetLemmatizer from api.utils.file_utils import get_project_base_directory -from api.utils.log_utils import logger class RagTokenizer: @@ -36,7 +36,7 @@ class RagTokenizer: return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1] def loadDict_(self, fnm): - logger.info(f"[HUQIE]:Build trie {fnm}") + logging.info(f"[HUQIE]:Build trie {fnm}") try: of = open(fnm, "r", encoding='utf-8') while True: @@ -53,7 +53,7 @@ class RagTokenizer: self.trie_.save(fnm + ".trie") of.close() except Exception: - logger.exception(f"[HUQIE]:Build trie {fnm} failed") + logging.exception(f"[HUQIE]:Build trie {fnm} failed") def __init__(self, debug=False): self.DEBUG = debug @@ -69,7 +69,7 @@ class RagTokenizer: self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") return except Exception: - logger.exception("[HUQIE]:Build default trie") + logging.exception("[HUQIE]:Build default trie") self.trie_ = datrie.Trie(string.printable) self.loadDict_(self.DIR_ + ".txt") @@ -173,7 +173,7 @@ class RagTokenizer: tks.append(tk) F /= len(tks) L /= len(tks) - logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F)) + logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F)) return tks, B / len(tks) + L + F def sortTks_(self, tkslist): @@ -277,8 +277,8 @@ class RagTokenizer: tks, s = self.maxForward_(L) tks1, s1 = self.maxBackward_(L) if self.DEBUG: - logger.debug("[FW] {} {}".format(tks, s)) - logger.debug("[BW] {} {}".format(tks1, s1)) + logging.debug("[FW] {} {}".format(tks, s)) + logging.debug("[BW] {} {}".format(tks1, s1)) i, j, _i, _j = 0, 0, 0, 0 same = 0 @@ -325,7 +325,7 @@ class RagTokenizer: res.append(" ".join(self.sortTks_(tkslist)[0][0])) res = " ".join(self.english_normalize_(res)) - logger.debug("[TKS] {}".format(self.merge_(res))) + logging.debug("[TKS] {}".format(self.merge_(res))) return self.merge_(res) def fine_grained_tokenize(self, tks): @@ -416,30 +416,30 @@ if __name__ == '__main__': # huqie.addUserDict("/tmp/tmp.new.tks.dict") tks = tknzr.tokenize( "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize( "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = 
tknzr.tokenize( "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize( "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize("虽然我不怎么玩") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize( "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) tks = tknzr.tokenize( "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-") - logger.info(tknzr.fine_grained_tokenize(tks)) + logging.info(tknzr.fine_grained_tokenize(tks)) if len(sys.argv) < 2: sys.exit() tknzr.DEBUG = False @@ -449,5 +449,5 @@ if __name__ == '__main__': line = of.readline() if not line: break - logger.info(tknzr.tokenize(line)) + logging.info(tknzr.tokenize(line)) of.close() diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 8e288788fddb6dc5fbf57d9ef9e506dd934da21a..ffad855834c0eb8c1a109c0a5697588857200703 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -14,12 +14,12 @@ # limitations under the License. 
# +import logging import re import json from typing import List, Optional, Dict, Union from dataclasses import dataclass -from api.utils.log_utils import logger from rag.utils import rmSpace from rag.nlp import rag_tokenizer, query import numpy as np @@ -83,7 +83,7 @@ class Dealer: orderBy.desc("create_timestamp_flt") res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids) total=self.dataStore.getTotal(res) - logger.info("Dealer.search TOTAL: {}".format(total)) + logging.debug("Dealer.search TOTAL: {}".format(total)) else: highlightFields = ["content_ltks", "title_tks"] if highlight else [] matchText, keywords = self.qryr.question(qst, min_match=0.3) @@ -91,7 +91,7 @@ class Dealer: matchExprs = [matchText] res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids) total=self.dataStore.getTotal(res) - logger.info("Dealer.search TOTAL: {}".format(total)) + logging.debug("Dealer.search TOTAL: {}".format(total)) else: matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1)) q_vec = matchDense.embedding_data @@ -102,7 +102,7 @@ class Dealer: res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids) total=self.dataStore.getTotal(res) - logger.info("Dealer.search TOTAL: {}".format(total)) + logging.debug("Dealer.search TOTAL: {}".format(total)) # If result is empty, try again with lower min_match if total == 0: @@ -112,7 +112,7 @@ class Dealer: matchDense.extra_options["similarity"] = 0.17 res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids) total=self.dataStore.getTotal(res) - logger.info("Dealer.search 2 TOTAL: {}".format(total)) + logging.debug("Dealer.search 2 TOTAL: {}".format(total)) for k in keywords: kwds.add(k) @@ -123,7 +123,7 @@ class Dealer: continue kwds.add(kk) - logger.info(f"TOTAL: {total}") + logging.debug(f"TOTAL: {total}") ids=self.dataStore.getChunkIds(res) keywords=list(kwds) highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight") @@ -180,7 +180,7 @@ class Dealer: continue idx.append(i) pieces_.append(t) - logger.info("{} => {}".format(answer, pieces_)) + logging.debug("{} => {}".format(answer, pieces_)) if not pieces_: return answer, set([]) @@ -201,7 +201,7 @@ class Dealer: chunks_tks, tkweight, vtweight) mx = np.max(sim) * 0.99 - logger.info("{} SIM: {}".format(pieces_[i], mx)) + logging.debug("{} SIM: {}".format(pieces_[i], mx)) if mx < thr: continue cites[idx[i]] = list( diff --git a/rag/nlp/synonym.py b/rag/nlp/synonym.py index 1575b534462816073837be33a92214bde7057225..0c7721da6d1990ff2df449604fdbdb2342ce6565 100644 --- a/rag/nlp/synonym.py +++ b/rag/nlp/synonym.py @@ -14,13 +14,13 @@ # limitations under the License. 
# +import logging import json import os import time import re from nltk.corpus import wordnet from api.utils.file_utils import get_project_base_directory -from api.utils.log_utils import logger class Dealer: @@ -33,14 +33,14 @@ class Dealer: try: self.dictionary = json.load(open(path, 'r')) except Exception: - logger.warn("Missing synonym.json") + logging.warn("Missing synonym.json") self.dictionary = {} if not redis: - logger.warning( + logging.warning( "Realtime synonym is disabled, since no redis connection.") if not len(self.dictionary.keys()): - logger.warning("Fail to load synonym") + logging.warning("Fail to load synonym") self.redis = redis self.load() @@ -64,7 +64,7 @@ class Dealer: d = json.loads(d) self.dictionary = d except Exception as e: - logger.error("Fail to load synonym!" + str(e)) + logging.error("Fail to load synonym!" + str(e)) def lookup(self, tk): if re.match(r"[a-z]+$", tk): diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index 2a3533ca513602b835a84b9aab15c2a867878ea1..afd409a1597b59f7f0d5e72b21bc3adf0d6af55e 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -14,6 +14,7 @@ # limitations under the License. # +import logging import math import json import re @@ -21,7 +22,6 @@ import os import numpy as np from rag.nlp import rag_tokenizer from api.utils.file_utils import get_project_base_directory -from api.utils.log_utils import logger class Dealer: @@ -83,11 +83,11 @@ class Dealer: try: self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r")) except Exception: - logger.warning("Load ner.json FAIL!") + logging.warning("Load ner.json FAIL!") try: self.df = load_dict(os.path.join(fnm, "term.freq")) except Exception: - logger.warning("Load term.freq FAIL!") + logging.warning("Load term.freq FAIL!") def pretoken(self, txt, num=False, stpwd=True): patt = [ diff --git a/rag/raptor.py b/rag/raptor.py index 9839c5009f5c100a8add465874d699683e7a17fd..bd07f9406050c8089a7700a46e33e975f7d9c611 100644 --- a/rag/raptor.py +++ b/rag/raptor.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging import re from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait from threading import Lock @@ -22,7 +23,6 @@ import numpy as np from sklearn.mixture import GaussianMixture from rag.utils import truncate -from api.utils.log_utils import logger class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: @@ -62,13 +62,13 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: {"temperature": 0.3, "max_tokens": self._max_token} ) cnt = re.sub("(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)", "", cnt) - logger.info(f"SUM: {cnt}") + logging.debug(f"SUM: {cnt}") embds, _ = self._embd_model.encode([cnt]) with lock: if not len(embds[0]): return chunks.append((cnt, embds[0])) except Exception as e: - logger.exception("summarize got exception") + logging.exception("summarize got exception") return e labels = [] @@ -104,7 +104,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: ck_idx = [i+start for i in range(len(lbls)) if lbls[i] == c] threads.append(executor.submit(summarize, ck_idx, lock)) wait(threads, return_when=ALL_COMPLETED) - logger.info(str([t.result() for t in threads])) + logging.debug(str([t.result() for t in threads])) assert len(chunks) - end == n_clusters, "{} vs. 
{}".format(len(chunks) - end, n_clusters) labels.extend(lbls) diff --git a/rag/svr/cache_file_svr.py b/rag/svr/cache_file_svr.py index 6b0581164c14b05034d62a6cca86040dc4765890..98769d3b53261eb6db46ea85f491677466020cc6 100644 --- a/rag/svr/cache_file_svr.py +++ b/rag/svr/cache_file_svr.py @@ -13,19 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging import time import traceback from api.db.db_models import close_connection from api.db.services.task_service import TaskService -from api.utils.log_utils import logger from rag.utils.storage_factory import STORAGE_IMPL from rag.utils.redis_conn import REDIS_CONN def collect(): doc_locations = TaskService.get_ongoing_doc_name() - logger.info(doc_locations) + logging.debug(doc_locations) if len(doc_locations) == 0: time.sleep(1) return @@ -34,7 +34,7 @@ def collect(): def main(): locations = collect() if not locations:return - logger.info(f"TASKS: {len(locations)}") + logging.info(f"TASKS: {len(locations)}") for kb_id, loc in locations: try: if REDIS_CONN.is_alive(): @@ -43,7 +43,7 @@ def main(): if REDIS_CONN.exist(key):continue file_bin = STORAGE_IMPL.get(kb_id, loc) REDIS_CONN.transaction(key, file_bin, 12 * 60) - logger.info("CACHE: {}".format(loc)) + logging.info("CACHE: {}".format(loc)) except Exception as e: traceback.print_stack(e) except Exception as e: diff --git a/rag/svr/discord_svr.py b/rag/svr/discord_svr.py index 313205dea8d9c10568ce0b69359e27d1d9ff40ff..ec842c45c7c67b4933cbd3eb46d08fed132b5fca 100644 --- a/rag/svr/discord_svr.py +++ b/rag/svr/discord_svr.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging import discord import requests import base64 import asyncio -from api.utils.log_utils import logger URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk @@ -37,7 +37,7 @@ client = discord.Client(intents=intents) @client.event async def on_ready(): - logger.info(f'We have logged in as {client.user}') + logging.info(f'We have logged in as {client.user}') @client.event diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 4c9b92b01352a1959aa4623f087dd53380442bf2..c8fe5ccfa440d9af73309eccc14c76d2d08cc950 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -13,9 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import logging +import inspect +from api.utils.log_utils import initRootLogger +initRootLogger(inspect.getfile(inspect.currentframe())) +for module in ["pdfminer"]: + module_logger = logging.getLogger(module) + module_logger.setLevel(logging.WARNING) +for module in ["peewee"]: + module_logger = logging.getLogger(module) + module_logger.handlers.clear() + module_logger.propagate = True + import datetime import json -import logging import os import hashlib import copy @@ -42,7 +53,6 @@ from api.db.db_models import close_connection from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email from rag.nlp import search, rag_tokenizer from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor -from api.utils.log_utils import logger, LOG_FILE from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME from rag.utils import rmSpace, num_tokens_from_string from rag.utils.redis_conn import REDIS_CONN, Payload @@ -90,7 +100,7 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing... try: TaskService.update_progress(task_id, d) except Exception: - logger.exception(f"set_progress({task_id}) got exception") + logging.exception(f"set_progress({task_id}) got exception") close_connection() if cancel: @@ -110,7 +120,7 @@ def collect(): time.sleep(1) return pd.DataFrame() except Exception: - logger.exception("Get task event from queue exception") + logging.exception("Get task event from queue exception") return pd.DataFrame() msg = PAYLOAD.get_message() @@ -118,11 +128,11 @@ def collect(): return pd.DataFrame() if TaskService.do_cancel(msg["id"]): - logger.info("Task {} has been canceled.".format(msg["id"])) + logging.info("Task {} has been canceled.".format(msg["id"])) return pd.DataFrame() tasks = TaskService.get_tasks(msg["id"]) if not tasks: - logger.warning("{} empty task!".format(msg["id"])) + logging.warning("{} empty task!".format(msg["id"])) return [] tasks = pd.DataFrame(tasks) @@ -151,29 +161,29 @@ def build(row): st = timer() bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"]) binary = get_storage_binary(bucket, name) - logger.info( + logging.info( "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"])) except TimeoutError: callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.") - logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"])) + logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"])) return except Exception as e: if re.search("(No such file|not found)", str(e)): callback(-1, "Can not find file <%s> from minio. Could you try it again?" 
% row["name"]) else: callback(-1, "Get file from minio: %s" % str(e).replace("'", "")) - logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"])) + logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"])) return try: cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"], to_page=row["to_page"], lang=row["language"], callback=callback, kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"]) - logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"])) + logging.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"])) except Exception as e: callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", "")) - logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"])) + logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"])) return docs = [] @@ -210,12 +220,12 @@ def build(row): STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue()) el += timer() - st except Exception: - logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"])) + logging.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"])) d["img_id"] = "{}-{}".format(row["kb_id"], d["id"]) del d["image"] docs.append(d) - logger.info("MINIO PUT({}):{}".format(row["name"], el)) + logging.info("MINIO PUT({}):{}".format(row["name"], el)) if row["parser_config"].get("auto_keywords", 0): st = timer() @@ -345,7 +355,7 @@ def main(): embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"]) except Exception as e: callback(-1, msg=str(e)) - logger.exception("LLMBundle got exception") + logging.exception("LLMBundle got exception") continue if r.get("task_type", "") == "raptor": @@ -354,12 +364,12 @@ def main(): cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback) except Exception as e: callback(-1, msg=str(e)) - logger.exception("run_raptor got exception") + logging.exception("run_raptor got exception") continue else: st = timer() cks = build(r) - logger.info("Build chunks({}): {}".format(r["name"], timer() - st)) + logging.info("Build chunks({}): {}".format(r["name"], timer() - st)) if cks is None: continue if not cks: @@ -375,12 +385,12 @@ def main(): tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback) except Exception as e: callback(-1, "Embedding error:{}".format(str(e))) - logger.exception("run_rembedding got exception") + logging.exception("run_rembedding got exception") tk_count = 0 - logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st)) + logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st)) callback(msg="Finished embedding (in {:.2f}s)! 
Start to build index!".format(timer() - st)) - # logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}") + # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}") init_kb(r, vector_size) chunk_count = len(set([c["id"] for c in cks])) st = timer() @@ -391,11 +401,11 @@ def main(): if b % 128 == 0: callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="") - logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st)) + logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st)) if es_r: callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!") docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) - logger.error('Insert chunk error: ' + str(es_r)) + logging.error('Insert chunk error: ' + str(es_r)) else: if TaskService.do_cancel(r["id"]): docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) @@ -404,7 +414,7 @@ def main(): callback(1., "Done!") DocumentService.increment_chunk_num( r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) - logger.info( + logging.info( "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format( r["id"], tk_count, len(cks), timer() - st)) @@ -421,16 +431,11 @@ def report_status(): obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:] REDIS_CONN.set_obj("TASKEXE", obj, 60*2) except Exception: - logger.exception("report_status got exception") + logging.exception("report_status got exception") time.sleep(30) if __name__ == "__main__": - peewee_logger = logging.getLogger('peewee') - peewee_logger.propagate = False - peewee_logger.addHandler(logger.handlers[0]) - peewee_logger.setLevel(logger.handlers[0].level) - exe = ThreadPoolExecutor(max_workers=1) exe.submit(report_status) diff --git a/rag/utils/azure_sas_conn.py b/rag/utils/azure_sas_conn.py index 87495e63318f9ed3bca66229f2b5928d193bbc4b..275f7fe6b18c07fa4c5d4dbd897d8624a60c3d3f 100644 --- a/rag/utils/azure_sas_conn.py +++ b/rag/utils/azure_sas_conn.py @@ -1,3 +1,4 @@ +import logging import os import time from io import BytesIO @@ -24,7 +25,7 @@ class RAGFlowAzureSasBlob(object): try: self.conn = ContainerClient.from_container_url(self.container_url + "?" 
+ self.sas_token) except Exception: - logger.exception("Fail to connect %s " % self.container_url) + logging.exception("Fail to connect %s " % self.container_url) def __close__(self): del self.conn @@ -39,7 +40,7 @@ class RAGFlowAzureSasBlob(object): try: return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary)) except Exception: - logger.exception(f"Fail put {bucket}/{fnm}") + logging.exception(f"Fail put {bucket}/{fnm}") self.__open__() time.sleep(1) @@ -47,7 +48,7 @@ class RAGFlowAzureSasBlob(object): try: self.conn.delete_blob(fnm) except Exception: - logger.exception(f"Fail rm {bucket}/{fnm}") + logging.exception(f"Fail rm {bucket}/{fnm}") def get(self, bucket, fnm): for _ in range(1): @@ -55,7 +56,7 @@ class RAGFlowAzureSasBlob(object): r = self.conn.download_blob(fnm) return r.read() except Exception: - logger.exception(f"fail get {bucket}/{fnm}") + logging.exception(f"fail get {bucket}/{fnm}") self.__open__() time.sleep(1) return @@ -64,7 +65,7 @@ class RAGFlowAzureSasBlob(object): try: return self.conn.get_blob_client(fnm).exists() except Exception: - logger.exception(f"Fail put {bucket}/{fnm}") + logging.exception(f"Fail put {bucket}/{fnm}") return False def get_presigned_url(self, bucket, fnm, expires): @@ -72,7 +73,7 @@ class RAGFlowAzureSasBlob(object): try: return self.conn.get_presigned_url("GET", bucket, fnm, expires) except Exception: - logger.exception(f"fail get {bucket}/{fnm}") + logging.exception(f"fail get {bucket}/{fnm}") self.__open__() time.sleep(1) return diff --git a/rag/utils/azure_spn_conn.py b/rag/utils/azure_spn_conn.py index cbaea2131002ca87e389ef6cf019c58678f1af58..7081f892d1a7408d28eaa040db9eb7cff3150ce1 100644 --- a/rag/utils/azure_spn_conn.py +++ b/rag/utils/azure_spn_conn.py @@ -1,3 +1,4 @@ +import logging import os import time from rag import settings @@ -28,7 +29,7 @@ class RAGFlowAzureSpnBlob(object): credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA) self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials) except Exception: - logger.exception("Fail to connect %s" % self.account_url) + logging.exception("Fail to connect %s" % self.account_url) def __close__(self): del self.conn @@ -47,7 +48,7 @@ class RAGFlowAzureSpnBlob(object): f.append_data(binary, offset=0, length=len(binary)) return f.flush_data(len(binary)) except Exception: - logger.exception(f"Fail put {bucket}/{fnm}") + logging.exception(f"Fail put {bucket}/{fnm}") self.__open__() time.sleep(1) @@ -55,7 +56,7 @@ class RAGFlowAzureSpnBlob(object): try: self.conn.delete_file(fnm) except Exception: - logger.exception(f"Fail rm {bucket}/{fnm}") + logging.exception(f"Fail rm {bucket}/{fnm}") def get(self, bucket, fnm): for _ in range(1): @@ -64,7 +65,7 @@ class RAGFlowAzureSpnBlob(object): r = client.download_file() return r.read() except Exception: - logger.exception(f"fail get {bucket}/{fnm}") + logging.exception(f"fail get {bucket}/{fnm}") self.__open__() time.sleep(1) return @@ -74,7 +75,7 @@ class RAGFlowAzureSpnBlob(object): client = self.conn.get_file_client(fnm) return client.exists() except Exception: - logger.exception(f"Fail put {bucket}/{fnm}") + logging.exception(f"Fail put {bucket}/{fnm}") return False def get_presigned_url(self, bucket, fnm, expires): @@ -82,7 +83,7 @@ class RAGFlowAzureSpnBlob(object): try: return self.conn.get_presigned_url("GET", bucket, fnm, expires) except Exception: - 
logger.exception(f"fail get {bucket}/{fnm}") + logging.exception(f"fail get {bucket}/{fnm}") self.__open__() time.sleep(1) return \ No newline at end of file diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py index c2ae7f2c8150838c0fa4ed39c21cbd951d7b4223..b372ff67a507d6d9fde8380a773c8f5b0fd8458a 100644 --- a/rag/utils/es_conn.py +++ b/rag/utils/es_conn.py @@ -1,3 +1,4 @@ +import logging import re import json import time @@ -8,7 +9,6 @@ import copy from elasticsearch import Elasticsearch from elasticsearch_dsl import UpdateByQuery, Q, Search, Index from elastic_transport import ConnectionTimeout -from api.utils.log_utils import logger from rag import settings from rag.utils import singleton from api.utils.file_utils import get_project_base_directory @@ -22,7 +22,7 @@ from rag.nlp import is_english, rag_tokenizer class ESConnection(DocStoreConnection): def __init__(self): self.info = {} - logger.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.") + logging.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.") for _ in range(24): try: self.es = Elasticsearch( @@ -36,25 +36,25 @@ class ESConnection(DocStoreConnection): self.info = self.es.info() break except Exception as e: - logger.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.") + logging.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.") time.sleep(5) if not self.es.ping(): msg = f"Elasticsearch {settings.ES['hosts']} didn't become healthy in 120s." - logger.error(msg) + logging.error(msg) raise Exception(msg) v = self.info.get("version", {"number": "8.11.3"}) v = v["number"].split(".")[0] if int(v) < 8: msg = f"Elasticsearch version must be greater than or equal to 8, current version: {v}" - logger.error(msg) + logging.error(msg) raise Exception(msg) fp_mapping = os.path.join(get_project_base_directory(), "conf", "mapping.json") if not os.path.exists(fp_mapping): msg = f"Elasticsearch mapping file not found at {fp_mapping}" - logger.error(msg) + logging.error(msg) raise Exception(msg) self.mapping = json.load(open(fp_mapping, "r")) - logger.info(f"Elasticsearch {settings.ES['hosts']} is healthy.") + logging.info(f"Elasticsearch {settings.ES['hosts']} is healthy.") """ Database operations @@ -79,13 +79,13 @@ class ESConnection(DocStoreConnection): settings=self.mapping["settings"], mappings=self.mapping["mappings"]) except Exception: - logger.exception("ES create index error %s" % (indexName)) + logging.exception("ES create index error %s" % (indexName)) def deleteIdx(self, indexName: str, knowledgebaseId: str): try: return self.es.indices.delete(indexName, allow_no_indices=True) except Exception: - logger.exception("ES delete index error %s" % (indexName)) + logging.exception("ES delete index error %s" % (indexName)) def indexExist(self, indexName: str, knowledgebaseId: str) -> bool: s = Index(indexName, self.es) @@ -93,7 +93,7 @@ class ESConnection(DocStoreConnection): try: return s.exists() except Exception as e: - logger.exception("ES indexExist") + logging.exception("ES indexExist") if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0: continue return False @@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection): s = s[offset:limit] q = s.to_dict() print(json.dumps(q), flush=True) - # logger.info("ESConnection.search [Q]: " + json.dumps(q)) + logging.debug("ESConnection.search [Q]: " + json.dumps(q)) for i in range(3): try: @@ -190,14 +190,14 @@ class ESConnection(DocStoreConnection): _source=True) if str(res.get("timed_out", 
"")).lower() == "true": raise Exception("Es Timeout.") - logger.info("ESConnection.search res: " + str(res)) + logging.debug("ESConnection.search res: " + str(res)) return res except Exception as e: - logger.exception("ES search [Q]: " + str(q)) + logging.exception("ES search [Q]: " + str(q)) if str(e).find("Timeout") > 0: continue raise e - logger.error("ES search timeout for 3 times!") + logging.error("ES search timeout for 3 times!") raise Exception("ES search timeout.") def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None: @@ -213,11 +213,11 @@ class ESConnection(DocStoreConnection): chunk["id"] = chunkId return chunk except Exception as e: - logger.exception(f"ES get({chunkId}) got exception") + logging.exception(f"ES get({chunkId}) got exception") if str(e).find("Timeout") > 0: continue raise e - logger.error("ES search timeout for 3 times!") + logging.error("ES search timeout for 3 times!") raise Exception("ES search timeout.") def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]: @@ -247,7 +247,7 @@ class ESConnection(DocStoreConnection): res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"])) return res except Exception as e: - logger.warning("Fail to bulk: " + str(e)) + logging.warning("Fail to bulk: " + str(e)) if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE): time.sleep(3) continue @@ -264,7 +264,7 @@ class ESConnection(DocStoreConnection): self.es.update(index=indexName, id=chunkId, doc=doc) return True except Exception as e: - logger.exception( + logging.exception( f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})") if str(e).find("Timeout") > 0: continue @@ -304,7 +304,7 @@ class ESConnection(DocStoreConnection): _ = ubq.execute() return True except Exception as e: - logger.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict())) + logging.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict())) if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0: continue return False @@ -326,7 +326,7 @@ class ESConnection(DocStoreConnection): qry.must.append(Q("term", **{k: v})) else: raise Exception("Condition value must be int, str or list.") - logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict())) + logging.debug("ESConnection.delete [Q]: " + json.dumps(qry.to_dict())) for _ in range(10): try: res = self.es.delete_by_query( @@ -335,7 +335,7 @@ class ESConnection(DocStoreConnection): refresh=True) return res["deleted"] except Exception as e: - logger.warning("Fail to delete: " + str(filter) + str(e)) + logging.warning("Fail to delete: " + str(filter) + str(e)) if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE): time.sleep(3) continue @@ -419,7 +419,7 @@ class ESConnection(DocStoreConnection): """ def sql(self, sql: str, fetch_size: int, format: str): - logger.info(f"ESConnection.sql get sql: {sql}") + logging.debug(f"ESConnection.sql get sql: {sql}") sql = re.sub(r"[ `]+", " ", sql) sql = sql.replace("%", "") replaces = [] @@ -436,7 +436,7 @@ class ESConnection(DocStoreConnection): for p, r in replaces: sql = sql.replace(p, r, 1) - logger.info(f"ESConnection.sql to es: {sql}") + logging.debug(f"ESConnection.sql to es: {sql}") for i in range(3): try: @@ -444,10 +444,10 @@ class ESConnection(DocStoreConnection): request_timeout="2s") return res except ConnectionTimeout: - logger.exception("ESConnection.sql timeout [Q]: " + sql) + logging.exception("ESConnection.sql timeout [Q]: " + sql) 
                 continue
             except Exception:
-                logger.exception("ESConnection.sql got exception [Q]: " + sql)
+                logging.exception("ESConnection.sql got exception [Q]: " + sql)
                 return None
-        logger.error("ESConnection.sql timeout for 3 times!")
+        logging.error("ESConnection.sql timeout for 3 times!")
         return None
diff --git a/rag/utils/infinity_conn.py b/rag/utils/infinity_conn.py
index baa2541340b04e66eb060025be180a0494ab373d..f6ca3493b23e3c50391ada868d75cbf26d8a9d85 100644
--- a/rag/utils/infinity_conn.py
+++ b/rag/utils/infinity_conn.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import re
 import json
@@ -7,7 +8,6 @@ import infinity
 from infinity.common import ConflictType, InfinityException
 from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
-from api.utils.log_utils import logger
 from rag import settings
 from rag.utils import singleton
 import polars as pl
@@ -56,7 +56,7 @@ class InfinityConnection(DocStoreConnection):
             host, port = infinity_uri.split(":")
             infinity_uri = infinity.common.NetworkAddress(host, int(port))
         self.connPool = None
-        logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
+        logging.info(f"Use Infinity {infinity_uri} as the doc engine.")
         for _ in range(24):
             try:
                 connPool = ConnectionPool(infinity_uri)
@@ -66,13 +66,13 @@ class InfinityConnection(DocStoreConnection):
                 self.connPool = connPool
                 break
             except Exception as e:
-                logger.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
+                logging.warning(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
                 time.sleep(5)
         if self.connPool is None:
             msg = f"Infinity {infinity_uri} didn't become healthy in 120s."
-            logger.error(msg)
+            logging.error(msg)
             raise Exception(msg)
-        logger.info(f"Infinity {infinity_uri} is healthy.")
+        logging.info(f"Infinity {infinity_uri} is healthy.")
 
     """
     Database operations
@@ -148,7 +148,7 @@ class InfinityConnection(DocStoreConnection):
                 )
                 break
         self.connPool.release_conn(inf_conn)
-        logger.info(
+        logging.info(
             f"INFINITY created table {table_name}, vector size {vectorSize}"
         )
 
@@ -158,7 +158,7 @@ class InfinityConnection(DocStoreConnection):
         db_instance = inf_conn.get_database(self.dbName)
         db_instance.drop_table(table_name, ConflictType.Ignore)
         self.connPool.release_conn(inf_conn)
-        logger.info(f"INFINITY dropped table {table_name}")
+        logging.info(f"INFINITY dropped table {table_name}")
 
     def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
         table_name = f"{indexName}_{knowledgebaseId}"
@@ -169,7 +169,7 @@ class InfinityConnection(DocStoreConnection):
             self.connPool.release_conn(inf_conn)
             return True
         except Exception as e:
-            logger.warn(f"INFINITY indexExist {str(e)}")
+            logging.warning(f"INFINITY indexExist {str(e)}")
         return False
 
     """
@@ -216,7 +216,7 @@ class InfinityConnection(DocStoreConnection):
                 )
                 if len(filter_cond) != 0:
                     filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
-                # logger.info(f"filter_fulltext: {filter_fulltext}")
+                logging.debug(f"filter_fulltext: {filter_fulltext}")
                 minimum_should_match = "0%"
                 if "minimum_should_match" in matchExpr.extra_options:
                     minimum_should_match = (
@@ -279,7 +279,7 @@ class InfinityConnection(DocStoreConnection):
             df_list.append(kb_res)
         self.connPool.release_conn(inf_conn)
         res = pl.concat(df_list)
-        logger.info("INFINITY search tables: " + str(table_list))
+        logging.debug("INFINITY search tables: " + str(table_list))
         return res
 
     def get(
@@ -334,18 +334,18 @@ class InfinityConnection(DocStoreConnection):
             str_filter = f"id IN ({str_ids})"
             table_instance.delete(str_filter)
         # for doc in documents:
-        #     logger.info(f"insert position_list: {doc['position_list']}")
-        # logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
+        #     logging.info(f"insert position_list: {doc['position_list']}")
+        # logging.info(f"InfinityConnection.insert {json.dumps(documents)}")
         table_instance.insert(documents)
         self.connPool.release_conn(inf_conn)
-        logger.info(f"inserted into {table_name} {str_ids}.")
+        logging.debug(f"inserted into {table_name} {str_ids}.")
         return []
 
     def update(
         self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
     ) -> bool:
         # if 'position_list' in newValue:
-        #     logger.info(f"upsert position_list: {newValue['position_list']}")
+        #     logging.info(f"upsert position_list: {newValue['position_list']}")
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
         table_name = f"{indexName}_{knowledgebaseId}"
@@ -366,7 +366,7 @@ class InfinityConnection(DocStoreConnection):
         try:
             table_instance = db_instance.get_table(table_name)
         except Exception:
-            logger.warning(
+            logging.warning(
                 f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
             )
             return 0
diff --git a/rag/utils/minio_conn.py b/rag/utils/minio_conn.py
index 5053ef2d3f3e23e1cdd21886bf42f483b391cac4..b9c30386b73ee05e498e37fe377b5783bf5802e3 100644
--- a/rag/utils/minio_conn.py
+++ b/rag/utils/minio_conn.py
@@ -1,9 +1,9 @@
+import logging
 import time
 from minio import Minio
 from io import BytesIO
 from rag import settings
 from rag.utils import singleton
-from api.utils.log_utils import logger
 
 
 @singleton
@@ -26,7 +26,7 @@ class RAGFlowMinio(object):
                               secure=False
                               )
         except Exception:
-            logger.exception(
+            logging.exception(
                 "Fail to connect %s " % settings.MINIO["host"])
 
     def __close__(self):
@@ -55,7 +55,7 @@ class RAGFlowMinio(object):
                                          )
                 return r
             except Exception:
-                logger.exception(f"Fail put {bucket}/{fnm}:")
+                logging.exception(f"Fail put {bucket}/{fnm}:")
                 self.__open__()
                 time.sleep(1)
 
@@ -63,7 +63,7 @@ class RAGFlowMinio(object):
         try:
             self.conn.remove_object(bucket, fnm)
         except Exception:
-            logger.exception(f"Fail put {bucket}/{fnm}:")
+            logging.exception(f"Fail rm {bucket}/{fnm}:")
 
     def get(self, bucket, fnm):
         for _ in range(1):
@@ -71,7 +71,7 @@ class RAGFlowMinio(object):
                 r = self.conn.get_object(bucket, fnm)
                 return r.read()
             except Exception:
-                logger.exception(f"Fail put {bucket}/{fnm}:")
+                logging.exception(f"Fail get {bucket}/{fnm}:")
                 self.__open__()
                 time.sleep(1)
         return
@@ -81,7 +81,7 @@ class RAGFlowMinio(object):
             if self.conn.stat_object(bucket, fnm):return True
             return False
         except Exception:
-            logger.exception(f"Fail put {bucket}/{fnm}:")
+            logging.exception(f"Fail stat {bucket}/{fnm}:")
             return False
 
 
@@ -90,7 +90,7 @@ class RAGFlowMinio(object):
         try:
             return self.conn.get_presigned_url("GET", bucket, fnm, expires)
         except Exception:
-            logger.exception(f"Fail put {bucket}/{fnm}:")
+            logging.exception(f"Fail get_presigned_url {bucket}/{fnm}:")
             self.__open__()
             time.sleep(1)
         return
diff --git a/rag/utils/redis_conn.py b/rag/utils/redis_conn.py
index b256b0e0cadb2de2af61fb1ec165622c59ad8085..7529bee3223ef158b6c3bee92df6674cba2b392a 100644
--- a/rag/utils/redis_conn.py
+++ b/rag/utils/redis_conn.py
@@ -1,7 +1,7 @@
+import logging
 import json
 
 import valkey as redis
-import logging
 from rag import settings
 from rag.utils import singleton
 
diff --git a/rag/utils/s3_conn.py b/rag/utils/s3_conn.py
index 86397ce1ce3d223cefeb893063e1f319c8fd8cb5..4ad7bfc289327b12b686913b4684fa8fbe422ca1 100644
--- a/rag/utils/s3_conn.py
+++ b/rag/utils/s3_conn.py
@@ -1,3 +1,4 @@
+import logging
 import boto3
 import os
 from botocore.exceptions import ClientError
@@ -40,7 +41,7 @@ class RAGFlowS3(object):
                 config=config
             )
         except Exception:
-            logger.exception(
+            logging.exception(
                 "Fail to connect %s" % self.endpoint)
 
     def __close__(self):
@@ -49,11 +50,11 @@ class RAGFlowS3(object):
 
     def bucket_exists(self, bucket):
         try:
-            logger.debug(f"head_bucket bucketname {bucket}")
+            logging.debug(f"head_bucket bucketname {bucket}")
             self.conn.head_bucket(Bucket=bucket)
             exists = True
         except ClientError:
-            logger.exception(f"head_bucket error {bucket}")
+            logging.exception(f"head_bucket error {bucket}")
             exists = False
         return exists
 
@@ -62,7 +63,7 @@ class RAGFlowS3(object):
 
         if not self.bucket_exists(bucket):
             self.conn.create_bucket(Bucket=bucket)
-            logger.debug(f"create bucket {bucket} ********")
+            logging.debug(f"create bucket {bucket} ********")
 
         r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
         return r
 
@@ -74,17 +75,17 @@ class RAGFlowS3(object):
         return []
 
     def put(self, bucket, fnm, binary):
-        logger.debug(f"bucket name {bucket}; filename :{fnm}:")
+        logging.debug(f"bucket name {bucket}; filename :{fnm}:")
         for _ in range(1):
             try:
                 if not self.bucket_exists(bucket):
                     self.conn.create_bucket(Bucket=bucket)
-                    logger.info(f"create bucket {bucket} ********")
+                    logging.info(f"create bucket {bucket} ********")
 
                 r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
                 return r
             except Exception:
-                logger.exception(f"Fail put {bucket}/{fnm}")
+                logging.exception(f"Fail put {bucket}/{fnm}")
                 self.__open__()
                 time.sleep(1)
 
@@ -92,7 +93,7 @@ class RAGFlowS3(object):
         try:
             self.conn.delete_object(Bucket=bucket, Key=fnm)
         except Exception:
-            logger.exception(f"Fail rm {bucket}/{fnm}")
+            logging.exception(f"Fail rm {bucket}/{fnm}")
 
     def get(self, bucket, fnm):
         for _ in range(1):
@@ -101,7 +102,7 @@ class RAGFlowS3(object):
                 object_data = r['Body'].read()
                 return object_data
             except Exception:
-                logger.exception(f"fail get {bucket}/{fnm}")
+                logging.exception(f"fail get {bucket}/{fnm}")
                 self.__open__()
                 time.sleep(1)
         return
@@ -128,7 +129,7 @@ class RAGFlowS3(object):
 
                 return r
             except Exception:
-                logger.exception(f"fail get url {bucket}/{fnm}")
+                logging.exception(f"fail get url {bucket}/{fnm}")
                 self.__open__()
                 time.sleep(1)
         return