KevinHuSh
		
	committed on
		
		
					Commit 
							
							·
						
						2ef1d8e
	
1
								Parent(s):
							
							79f4fcc
								
resolve table issues (#125)
Browse files
 - Dockerfile +0 -1
 - Dockerfile.cuda +0 -1
 - api/apps/conversation_app.py +1 -0
 - api/db/init_data.py +1 -1
 - api/db/services/knowledgebase_service.py +1 -1
 - rag/app/table.py +10 -8
 - rag/utils/__init__.py +2 -2
 
    	
        Dockerfile
    CHANGED
    
    | 
         @@ -14,7 +14,6 @@ ADD ./rag ./rag 
     | 
|
| 14 | 
         
             
            ENV PYTHONPATH=/ragflow/
         
     | 
| 15 | 
         
             
            ENV HF_ENDPOINT=https://hf-mirror.com
         
     | 
| 16 | 
         | 
| 17 | 
         
            -
            /root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
         
     | 
| 18 | 
         
             
            ADD docker/entrypoint.sh ./entrypoint.sh
         
     | 
| 19 | 
         
             
            RUN chmod +x ./entrypoint.sh
         
     | 
| 20 | 
         | 
| 
         | 
|
| 14 | 
         
             
            ENV PYTHONPATH=/ragflow/
         
     | 
| 15 | 
         
             
            ENV HF_ENDPOINT=https://hf-mirror.com
         
     | 
| 16 | 
         | 
| 
         | 
|
| 17 | 
         
             
            ADD docker/entrypoint.sh ./entrypoint.sh
         
     | 
| 18 | 
         
             
            RUN chmod +x ./entrypoint.sh
         
     | 
| 19 | 
         | 
    	
        Dockerfile.cuda
    CHANGED
    
    | 
         @@ -19,7 +19,6 @@ ADD ./rag ./rag 
     | 
|
| 19 | 
         
             
            ENV PYTHONPATH=/ragflow/
         
     | 
| 20 | 
         
             
            ENV HF_ENDPOINT=https://hf-mirror.com
         
     | 
| 21 | 
         | 
| 22 | 
         
            -
            /root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
         
     | 
| 23 | 
         
             
            ADD docker/entrypoint.sh ./entrypoint.sh
         
     | 
| 24 | 
         
             
            RUN chmod +x ./entrypoint.sh
         
     | 
| 25 | 
         | 
| 
         | 
|
| 19 | 
         
             
            ENV PYTHONPATH=/ragflow/
         
     | 
| 20 | 
         
             
            ENV HF_ENDPOINT=https://hf-mirror.com
         
     | 
| 21 | 
         | 
| 
         | 
|
| 22 | 
         
             
            ADD docker/entrypoint.sh ./entrypoint.sh
         
     | 
| 23 | 
         
             
            RUN chmod +x ./entrypoint.sh
         
     | 
| 24 | 
         | 
    	
        api/apps/conversation_app.py
    CHANGED
    
    | 
         @@ -309,6 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl): 
     | 
|
| 309 | 
         
             
                # compose markdown table
         
     | 
| 310 | 
         
             
                clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
         
     | 
| 311 | 
         
             
                line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
         
     | 
| 
         | 
|
| 312 | 
         
             
                rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
         
     | 
| 313 | 
         
             
                if not docid_idx or not docnm_idx:
         
     | 
| 314 | 
         
             
                    chat_logger.warning("SQL missing field: " + sql)
         
     | 
| 
         | 
|
| 309 | 
         
             
                # compose markdown table
         
     | 
| 310 | 
         
             
                clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
         
     | 
| 311 | 
         
             
                line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
         
     | 
| 312 | 
         
            +
                line = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}\|", "|", line)
         
     | 
| 313 | 
         
             
                rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
         
     | 
| 314 | 
         
             
                if not docid_idx or not docnm_idx:
         
     | 
| 315 | 
         
             
                    chat_logger.warning("SQL missing field: " + sql)
         
     | 
    	
        api/db/init_data.py
    CHANGED
    
    | 
         @@ -94,7 +94,7 @@ def init_llm_factory(): 
     | 
|
| 94 | 
         
             
                         "name": "Local",
         
     | 
| 95 | 
         
             
                         "logo": "",
         
     | 
| 96 | 
         
             
                         "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
         
     | 
| 97 | 
         
            -
                        "status": " 
     | 
| 98 | 
         
             
                    },{
         
     | 
| 99 | 
         
             
                        "name": "Moonshot",
         
     | 
| 100 | 
         
             
                         "logo": "",
         
     | 
| 
         | 
|
| 94 | 
         
             
                         "name": "Local",
         
     | 
| 95 | 
         
             
                         "logo": "",
         
     | 
| 96 | 
         
             
                         "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
         
     | 
| 97 | 
         
            +
                        "status": "1",
         
     | 
| 98 | 
         
             
                    },{
         
     | 
| 99 | 
         
             
                        "name": "Moonshot",
         
     | 
| 100 | 
         
             
                         "logo": "",
         
     | 
    	
        api/db/services/knowledgebase_service.py
    CHANGED
    
    | 
         @@ -78,7 +78,7 @@ class KnowledgebaseService(CommonService): 
     | 
|
| 78 | 
         
             
                            if isinstance(v, dict):
         
     | 
| 79 | 
         
             
                                assert isinstance(old[k], dict)
         
     | 
| 80 | 
         
             
                                dfs_update(old[k], v)
         
     | 
| 81 | 
         
            -
                             
     | 
| 82 | 
         
             
                                assert isinstance(old[k], list)
         
     | 
| 83 | 
         
             
                                old[k] = list(set(old[k]+v))
         
     | 
| 84 | 
         
             
                            else: old[k] = v
         
     | 
| 
         | 
|
| 78 | 
         
             
                            if isinstance(v, dict):
         
     | 
| 79 | 
         
             
                                assert isinstance(old[k], dict)
         
     | 
| 80 | 
         
             
                                dfs_update(old[k], v)
         
     | 
| 81 | 
         
            +
                            elif isinstance(v, list):
         
     | 
| 82 | 
         
             
                                assert isinstance(old[k], list)
         
     | 
| 83 | 
         
             
                                old[k] = list(set(old[k]+v))
         
     | 
| 84 | 
         
             
                            else: old[k] = v
         
     | 
    	
        rag/app/table.py
    CHANGED
    
    | 
         @@ -73,9 +73,9 @@ def trans_datatime(s): 
     | 
|
| 73 | 
         | 
| 74 | 
         | 
| 75 | 
         
             
            def trans_bool(s):
         
     | 
| 76 | 
         
            -
                if re.match(r"(true|yes 
     | 
| 77 | 
         
             
                    return ["yes", "是"]
         
     | 
| 78 | 
         
            -
                if re.match(r"(false|no 
     | 
| 79 | 
         
             
                    return ["no", "否"]
         
     | 
| 80 | 
         | 
| 81 | 
         | 
| 
         @@ -107,9 +107,9 @@ def column_data_type(arr): 
     | 
|
| 107 | 
         
             
                        arr[i] = trans[ty](str(arr[i]))
         
     | 
| 108 | 
         
             
                    except Exception as e:
         
     | 
| 109 | 
         
             
                        arr[i] = None
         
     | 
| 110 | 
         
            -
                if ty == "text":
         
     | 
| 111 | 
         
            -
             
     | 
| 112 | 
         
            -
             
     | 
| 113 | 
         
             
                return arr, ty
         
     | 
| 114 | 
         | 
| 115 | 
         | 
| 
         @@ -170,7 +170,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese 
     | 
|
| 170 | 
         
             
                PY = Pinyin()
         
     | 
| 171 | 
         
             
                fieds_map = {
         
     | 
| 172 | 
         
             
                    "text": "_tks",
         
     | 
| 173 | 
         
            -
                    "int": " 
     | 
| 174 | 
         
             
                    "keyword": "_kwd",
         
     | 
| 175 | 
         
             
                    "float": "_flt",
         
     | 
| 176 | 
         
             
                    "datetime": "_dt",
         
     | 
| 
         @@ -189,7 +189,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese 
     | 
|
| 189 | 
         
             
                        df[clmns[j]] = cln
         
     | 
| 190 | 
         
             
                        if ty == "text":
         
     | 
| 191 | 
         
             
                            txts.extend([str(c) for c in cln if c])
         
     | 
| 192 | 
         
            -
                    clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
         
     | 
| 193 | 
         
             
                                 for i in range(len(clmns))]
         
     | 
| 194 | 
         | 
| 195 | 
         
             
                    eng = lang.lower() == "english"#is_english(txts)
         
     | 
| 
         @@ -204,6 +204,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese 
     | 
|
| 204 | 
         
             
                                continue
         
     | 
| 205 | 
         
             
                            if not str(row[clmns[j]]):
         
     | 
| 206 | 
         
             
                                continue
         
     | 
| 
         | 
|
| 
         | 
|
| 207 | 
         
             
                            fld = clmns_map[j][0]
         
     | 
| 208 | 
         
             
                            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
         
     | 
| 209 | 
         
             
                                row[clmns[j]])
         
     | 
| 
         @@ -223,7 +225,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese 
     | 
|
| 223 | 
         
             
            if __name__ == "__main__":
         
     | 
| 224 | 
         
             
                import sys
         
     | 
| 225 | 
         | 
| 226 | 
         
            -
                def dummy( 
     | 
| 227 | 
         
             
                    pass
         
     | 
| 228 | 
         | 
| 229 | 
         
             
                chunk(sys.argv[1], callback=dummy)
         
     | 
| 
         | 
|
| 73 | 
         | 
| 74 | 
         | 
| 75 | 
         
             
            def trans_bool(s):
         
     | 
| 76 | 
         
            +
                if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
         
     | 
| 77 | 
         
             
                    return ["yes", "是"]
         
     | 
| 78 | 
         
            +
                if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         
     | 
| 79 | 
         
             
                    return ["no", "否"]
         
     | 
| 80 | 
         | 
| 81 | 
         | 
| 
         | 
|
| 107 | 
         
             
                        arr[i] = trans[ty](str(arr[i]))
         
     | 
| 108 | 
         
             
                    except Exception as e:
         
     | 
| 109 | 
         
             
                        arr[i] = None
         
     | 
| 110 | 
         
            +
                #if ty == "text":
         
     | 
| 111 | 
         
            +
                #    if len(arr) > 128 and uni / len(arr) < 0.1:
         
     | 
| 112 | 
         
            +
                #        ty = "keyword"
         
     | 
| 113 | 
         
             
                return arr, ty
         
     | 
| 114 | 
         | 
| 115 | 
         | 
| 
         | 
|
| 170 | 
         
             
                PY = Pinyin()
         
     | 
| 171 | 
         
             
                fieds_map = {
         
     | 
| 172 | 
         
             
                    "text": "_tks",
         
     | 
| 173 | 
         
            +
                    "int": "_long",
         
     | 
| 174 | 
         
             
                    "keyword": "_kwd",
         
     | 
| 175 | 
         
             
                    "float": "_flt",
         
     | 
| 176 | 
         
             
                    "datetime": "_dt",
         
     | 
| 
         | 
|
| 189 | 
         
             
                        df[clmns[j]] = cln
         
     | 
| 190 | 
         
             
                        if ty == "text":
         
     | 
| 191 | 
         
             
                            txts.extend([str(c) for c in cln if c])
         
     | 
| 192 | 
         
            +
                    clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
         
     | 
| 193 | 
         
             
                                 for i in range(len(clmns))]
         
     | 
| 194 | 
         | 
| 195 | 
         
             
                    eng = lang.lower() == "english"#is_english(txts)
         
     | 
| 
         | 
|
| 204 | 
         
             
                                continue
         
     | 
| 205 | 
         
             
                            if not str(row[clmns[j]]):
         
     | 
| 206 | 
         
             
                                continue
         
     | 
| 207 | 
         
            +
                            if pd.isna(row[clmns[j]]):
         
     | 
| 208 | 
         
            +
                                continue
         
     | 
| 209 | 
         
             
                            fld = clmns_map[j][0]
         
     | 
| 210 | 
         
             
                            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
         
     | 
| 211 | 
         
             
                                row[clmns[j]])
         
     | 
| 
         | 
|
| 225 | 
         
             
            if __name__ == "__main__":
         
     | 
| 226 | 
         
             
                import sys
         
     | 
| 227 | 
         | 
| 228 | 
         
            +
                def dummy(prog=None, msg=""):
         
     | 
| 229 | 
         
             
                    pass
         
     | 
| 230 | 
         | 
| 231 | 
         
             
                chunk(sys.argv[1], callback=dummy)
         
     | 
    	
        rag/utils/__init__.py
    CHANGED
    
    | 
         @@ -19,8 +19,8 @@ from .minio_conn import MINIO 
     | 
|
| 19 | 
         
             
            from .es_conn import ELASTICSEARCH
         
     | 
| 20 | 
         | 
| 21 | 
         
             
            def rmSpace(txt):
         
     | 
| 22 | 
         
            -
                txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt)
         
     | 
| 23 | 
         
            -
                return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt)
         
     | 
| 24 | 
         | 
| 25 | 
         | 
| 26 | 
         
             
            def findMaxDt(fnm):
         
     | 
| 
         | 
|
| 19 | 
         
             
            from .es_conn import ELASTICSEARCH
         
     | 
| 20 | 
         | 
| 21 | 
         
             
            def rmSpace(txt):
         
     | 
| 22 | 
         
            +
                txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
         
     | 
| 23 | 
         
            +
                return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
         
     | 
| 24 | 
         | 
| 25 | 
         | 
| 26 | 
         
             
            def findMaxDt(fnm):
         
     |