KevinHuSh
committed on
Commit
·
2ef1d8e
1
Parent(s):
79f4fcc
resolve table issues (#125)
Browse files
- Dockerfile +0 -1
- Dockerfile.cuda +0 -1
- api/apps/conversation_app.py +1 -0
- api/db/init_data.py +1 -1
- api/db/services/knowledgebase_service.py +1 -1
- rag/app/table.py +10 -8
- rag/utils/__init__.py +2 -2
Dockerfile
CHANGED
@@ -14,7 +14,6 @@ ADD ./rag ./rag
|
|
14 |
ENV PYTHONPATH=/ragflow/
|
15 |
ENV HF_ENDPOINT=https://hf-mirror.com
|
16 |
|
17 |
-
/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
|
18 |
ADD docker/entrypoint.sh ./entrypoint.sh
|
19 |
RUN chmod +x ./entrypoint.sh
|
20 |
|
|
|
14 |
ENV PYTHONPATH=/ragflow/
|
15 |
ENV HF_ENDPOINT=https://hf-mirror.com
|
16 |
|
|
|
17 |
ADD docker/entrypoint.sh ./entrypoint.sh
|
18 |
RUN chmod +x ./entrypoint.sh
|
19 |
|
Dockerfile.cuda
CHANGED
@@ -19,7 +19,6 @@ ADD ./rag ./rag
|
|
19 |
ENV PYTHONPATH=/ragflow/
|
20 |
ENV HF_ENDPOINT=https://hf-mirror.com
|
21 |
|
22 |
-
/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
|
23 |
ADD docker/entrypoint.sh ./entrypoint.sh
|
24 |
RUN chmod +x ./entrypoint.sh
|
25 |
|
|
|
19 |
ENV PYTHONPATH=/ragflow/
|
20 |
ENV HF_ENDPOINT=https://hf-mirror.com
|
21 |
|
|
|
22 |
ADD docker/entrypoint.sh ./entrypoint.sh
|
23 |
RUN chmod +x ./entrypoint.sh
|
24 |
|
api/apps/conversation_app.py
CHANGED
@@ -309,6 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
|
|
309 |
# compose markdown table
|
310 |
clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
|
311 |
line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
|
|
|
312 |
rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
|
313 |
if not docid_idx or not docnm_idx:
|
314 |
chat_logger.warning("SQL missing field: " + sql)
|
|
|
309 |
# compose markdown table
|
310 |
clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
|
311 |
line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
|
312 |
+
line = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}\|", "|", line)
|
313 |
rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
|
314 |
if not docid_idx or not docnm_idx:
|
315 |
chat_logger.warning("SQL missing field: " + sql)
|
api/db/init_data.py
CHANGED
@@ -94,7 +94,7 @@ def init_llm_factory():
|
|
94 |
"name": "Local",
|
95 |
"logo": "",
|
96 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
97 |
-
"status": "
|
98 |
},{
|
99 |
"name": "Moonshot",
|
100 |
"logo": "",
|
|
|
94 |
"name": "Local",
|
95 |
"logo": "",
|
96 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
97 |
+
"status": "1",
|
98 |
},{
|
99 |
"name": "Moonshot",
|
100 |
"logo": "",
|
api/db/services/knowledgebase_service.py
CHANGED
@@ -78,7 +78,7 @@ class KnowledgebaseService(CommonService):
|
|
78 |
if isinstance(v, dict):
|
79 |
assert isinstance(old[k], dict)
|
80 |
dfs_update(old[k], v)
|
81 |
-
|
82 |
assert isinstance(old[k], list)
|
83 |
old[k] = list(set(old[k]+v))
|
84 |
else: old[k] = v
|
|
|
78 |
if isinstance(v, dict):
|
79 |
assert isinstance(old[k], dict)
|
80 |
dfs_update(old[k], v)
|
81 |
+
elif isinstance(v, list):
|
82 |
assert isinstance(old[k], list)
|
83 |
old[k] = list(set(old[k]+v))
|
84 |
else: old[k] = v
|
rag/app/table.py
CHANGED
@@ -73,9 +73,9 @@ def trans_datatime(s):
|
|
73 |
|
74 |
|
75 |
def trans_bool(s):
|
76 |
-
if re.match(r"(true|yes
|
77 |
return ["yes", "是"]
|
78 |
-
if re.match(r"(false|no
|
79 |
return ["no", "否"]
|
80 |
|
81 |
|
@@ -107,9 +107,9 @@ def column_data_type(arr):
|
|
107 |
arr[i] = trans[ty](str(arr[i]))
|
108 |
except Exception as e:
|
109 |
arr[i] = None
|
110 |
-
if ty == "text":
|
111 |
-
|
112 |
-
|
113 |
return arr, ty
|
114 |
|
115 |
|
@@ -170,7 +170,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
|
170 |
PY = Pinyin()
|
171 |
fieds_map = {
|
172 |
"text": "_tks",
|
173 |
-
"int": "
|
174 |
"keyword": "_kwd",
|
175 |
"float": "_flt",
|
176 |
"datetime": "_dt",
|
@@ -189,7 +189,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
|
189 |
df[clmns[j]] = cln
|
190 |
if ty == "text":
|
191 |
txts.extend([str(c) for c in cln if c])
|
192 |
-
clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
|
193 |
for i in range(len(clmns))]
|
194 |
|
195 |
eng = lang.lower() == "english"#is_english(txts)
|
@@ -204,6 +204,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
|
204 |
continue
|
205 |
if not str(row[clmns[j]]):
|
206 |
continue
|
|
|
|
|
207 |
fld = clmns_map[j][0]
|
208 |
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
209 |
row[clmns[j]])
|
@@ -223,7 +225,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
|
223 |
if __name__ == "__main__":
|
224 |
import sys
|
225 |
|
226 |
-
def dummy(
|
227 |
pass
|
228 |
|
229 |
chunk(sys.argv[1], callback=dummy)
|
|
|
73 |
|
74 |
|
75 |
def trans_bool(s):
|
76 |
+
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
|
77 |
return ["yes", "是"]
|
78 |
+
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
|
79 |
return ["no", "否"]
|
80 |
|
81 |
|
|
|
107 |
arr[i] = trans[ty](str(arr[i]))
|
108 |
except Exception as e:
|
109 |
arr[i] = None
|
110 |
+
#if ty == "text":
|
111 |
+
# if len(arr) > 128 and uni / len(arr) < 0.1:
|
112 |
+
# ty = "keyword"
|
113 |
return arr, ty
|
114 |
|
115 |
|
|
|
170 |
PY = Pinyin()
|
171 |
fieds_map = {
|
172 |
"text": "_tks",
|
173 |
+
"int": "_long",
|
174 |
"keyword": "_kwd",
|
175 |
"float": "_flt",
|
176 |
"datetime": "_dt",
|
|
|
189 |
df[clmns[j]] = cln
|
190 |
if ty == "text":
|
191 |
txts.extend([str(c) for c in cln if c])
|
192 |
+
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
|
193 |
for i in range(len(clmns))]
|
194 |
|
195 |
eng = lang.lower() == "english"#is_english(txts)
|
|
|
204 |
continue
|
205 |
if not str(row[clmns[j]]):
|
206 |
continue
|
207 |
+
if pd.isna(row[clmns[j]]):
|
208 |
+
continue
|
209 |
fld = clmns_map[j][0]
|
210 |
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
211 |
row[clmns[j]])
|
|
|
225 |
if __name__ == "__main__":
|
226 |
import sys
|
227 |
|
228 |
+
def dummy(prog=None, msg=""):
|
229 |
pass
|
230 |
|
231 |
chunk(sys.argv[1], callback=dummy)
|
rag/utils/__init__.py
CHANGED
@@ -19,8 +19,8 @@ from .minio_conn import MINIO
|
|
19 |
from .es_conn import ELASTICSEARCH
|
20 |
|
21 |
def rmSpace(txt):
|
22 |
-
txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt)
|
23 |
-
return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt)
|
24 |
|
25 |
|
26 |
def findMaxDt(fnm):
|
|
|
19 |
from .es_conn import ELASTICSEARCH
|
20 |
|
21 |
def rmSpace(txt):
|
22 |
+
txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
|
23 |
+
return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
|
24 |
|
25 |
|
26 |
def findMaxDt(fnm):
|