Spaces:

ChallengeHub
/

Chinese-LangChain

Runtime error

App Files Files Community

yanqiang committed on Apr 18, 2023

Commit

bd111f7

1 Parent(s): 96a6f43

update

Browse files

Files changed (10) hide show

.gitignore +2 -0
README.md +2 -1
cache/index.faiss +0 -0
cache/index.pkl +0 -0
create_knowledge.py +30 -0
images/result.png +0 -0
images/web_demo.png +0 -0
main.py +25 -13
tests/test_duckduckgo_search.py +4 -4
tests/test_langchain.py +3 -3

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	.idea

 .idea
+cache
+docs/zh_wikipedia

README.md CHANGED Viewed

@@ -4,7 +4,7 @@
 ## 🔥 效果演示
-![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/result.png)
 ## 🚀 特性
@@ -22,6 +22,7 @@
 * [ ] 检索结果过滤与排序
 * [ ] 互联网检索结果接入
 * [ ] 模型初始化有问题
 ## 交流
 欢迎多提建议、Bad cases，目前尚不完善，欢迎进群及时交流，也欢迎大家多提PR

 ## 🔥 效果演示
+![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)
 ## 🚀 特性
 * [ ] 检索结果过滤与排序
 * [ ] 互联网检索结果接入
 * [ ] 模型初始化有问题
+* [ ] 增加非LangChain策略
 ## 交流
 欢迎多提建议、Bad cases，目前尚不完善，欢迎进群及时交流，也欢迎大家多提PR

cache/index.faiss DELETED Viewed

Binary file (53.3 kB)

cache/index.pkl DELETED Viewed

Binary file (5.43 kB)

create_knowledge.py ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: create_knowledge.py
+@time: 2023/04/18
+@contact: [email protected]
+@software: PyCharm
+@description: coding..
+"""
+from langchain.docstore.document import Document
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from tqdm import tqdm
+# 中文Wikipedia数据导入示例：
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+docs = []
+with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
+    for idx, line in tqdm(enumerate(f.readlines())):
+        metadata = {"source": f'doc_id_{idx}'}
+        docs.append(Document(page_content=line.strip(), metadata=metadata))
+vector_store = FAISS.from_documents(docs, embeddings)
+vector_store.save_local('cache/zh_wikipedia/')

images/result.png DELETED Viewed

Binary file (72.3 kB)

images/web_demo.png ADDED Viewed

main.py CHANGED Viewed

@@ -10,8 +10,8 @@ os.environ["CUDA_VISIBLE_DEVICES"] = '0'
 # 修改成自己的配置！！！
 class LangChainCFG:
-    llm_model_name = 'THUDM/chatglm-6b-int4-qe'  # 本地模型文件 or huggingface远程仓库
-    embedding_model_name = 'GanymedeNil/text2vec-large-chinese'  # 检索模型文件 or huggingface远程仓库
     vector_store_path = './cache'
     docs_path = './docs'
@@ -91,19 +91,24 @@ with block as demo:
                 label="large language model",
                 value="ChatGLM-6B-int4")
-            with gr.Tab("select"):
-                selectFile = gr.Dropdown(file_list,
-                                         label="content file",
-                                         interactive=True,
-                                         value=file_list[0] if len(file_list) > 0 else None)
-            with gr.Tab("upload"):
-                file = gr.File(label="请上传知识库文件",
-                               file_types=['.txt', '.md', '.docx', '.pdf']
-                               )
             file.upload(upload_file,
                         inputs=file,
-                        outputs=selectFile)
         with gr.Column(scale=4):
             with gr.Row():
                 with gr.Column(scale=4):
@@ -137,4 +142,11 @@ with block as demo:
                        ],
                        outputs=[message, chatbot, state, search])
-demo.queue(concurrency_count=2).launch(server_name='0.0.0.0', server_port=8888, share=False,show_error=True, enable_queue=True)

 # 修改成自己的配置！！！
 class LangChainCFG:
+    llm_model_name = '../../pretrained_models/chatglm-6b-int4-qe'  # 本地模型文件 or huggingface远程仓库
+    embedding_model_name = '../../pretrained_models/text2vec-large-chinese'  # 检索模型文件 or huggingface远程仓库
     vector_store_path = './cache'
     docs_path = './docs'
                 label="large language model",
                 value="ChatGLM-6B-int4")
+            top_k = gr.Slider(1,
+                              20,
+                              value=2,
+                              step=1,
+                              label="向量匹配 top k",
+                              interactive=True)
+            kg_name = gr.Radio(['中文维基百科', '百度百科数据', '坦克世界'],
+                               label="知识库",
+                               value='中文维基百科',
+                               interactive=True)
+            file = gr.File(label="将文件上传到数据库",
+                           visible=True,
+                           file_types=['.txt', '.md', '.docx', '.pdf']
+                           )
             file.upload(upload_file,
                         inputs=file,
+                        outputs=None)
         with gr.Column(scale=4):
             with gr.Row():
                 with gr.Column(scale=4):
                        ],
                        outputs=[message, chatbot, state, search])
+demo.queue(concurrency_count=2).launch(
+    server_name='0.0.0.0',
+    server_port=8888,
+    share=False,
+    show_error=True,
+    debug=True,
+    enable_queue=True
+)

tests/test_duckduckgo_search.py CHANGED Viewed

@@ -2,9 +2,9 @@ from duckduckgo_search import ddg
 from duckduckgo_search.utils import SESSION
-SESSION.proxies = {
-    "http": f"socks5h://localhost:7890",
-    "https": f"socks5h://localhost:7890"
-}
 r = ddg("马保国")
 print(r)

 from duckduckgo_search.utils import SESSION
+# SESSION.proxies = {
+#     "http": f"socks5h://localhost:7890",
+#     "https": f"socks5h://localhost:7890"
+# }
 r = ddg("马保国")
 print(r)

tests/test_langchain.py CHANGED Viewed

@@ -4,8 +4,8 @@ from langchain.document_loaders import UnstructuredFileLoader
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
-embedding_model_name = 'pretrained_models/ernie-gram-zh'
-docs_path = 'docs'
 embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
 docs = []
@@ -22,7 +22,7 @@ vector_store.save_local('vector_store_local')
 search_result = vector_store.similarity_search_with_score(query='科比', k=2)
 print(search_result)
-loader = UnstructuredFileLoader(f'{docs_path}/added/科比.txt', mode="elements")
 doc = loader.load()
 vector_store.add_documents(doc)
 print(doc)

 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
 embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
 docs = []
 search_result = vector_store.similarity_search_with_score(query='科比', k=2)
 print(search_result)
+loader = UnstructuredFileLoader(f'{docs_path}/added/马保国.txt', mode="elements")
 doc = loader.load()
 vector_store.add_documents(doc)
 print(doc)