Upload folder using huggingface_hub
- .gitattributes +1 -0
- .gitignore +2 -0
- README.md +9 -8
- __pycache__/openai_utils.cpython-39.pyc +0 -0
- __pycache__/pdf_utils.cpython-39.pyc +0 -0
- __pycache__/prompt_utils.cpython-39.pyc +0 -0
- __pycache__/text_utils.cpython-39.pyc +0 -0
- __pycache__/vectordb_utils.cpython-39.pyc +0 -0
- import gradio as gr.py +12 -0
- llama2-extracted.pdf +3 -0
- openai_utils.py +25 -0
- pdf_utils.py +27 -0
- prompt_utils.py +28 -0
- requirements.txt +6 -0
- text_utils.py +26 -0
- vectordb_utils.py +28 -0
- web_demo.py +68 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llama2-extracted.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+# dotenv
+.env
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-
-colorFrom: yellow
-colorTo: indigo
+title: pdfsearch
+app_file: web_demo.py
 sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
+sdk_version: 4.15.0
 ---
+# chatpdf
+RAG homework
 
-
+Run:
+```
+python web_demo.py
+```
__pycache__/openai_utils.cpython-39.pyc ADDED
Binary file (957 Bytes)

__pycache__/pdf_utils.cpython-39.pyc ADDED
Binary file (884 Bytes)

__pycache__/prompt_utils.cpython-39.pyc ADDED
Binary file (1.07 kB)

__pycache__/text_utils.cpython-39.pyc ADDED
Binary file (952 Bytes)

__pycache__/vectordb_utils.cpython-39.pyc ADDED
Binary file (1.65 kB)
import gradio as gr.py ADDED
@@ -0,0 +1,12 @@
+import gradio as gr
+
+def greet(lala, intensity):
+    return "Hello, " + lala + "!" * int(intensity)
+
+demo = gr.Interface(
+    fn=greet,
+    inputs=["text", "slider"],
+    outputs=["text"],
+)
+
+demo.launch()
llama2-extracted.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a965aaca87d5f56e7ab43abc562a7205d3d22f72dce49e8dac1bad3abd5f114
+size 1238532
openai_utils.py ADDED
@@ -0,0 +1,25 @@
+import openai
+import os
+# Load environment variables
+from dotenv import load_dotenv, find_dotenv
+
+_ = load_dotenv(find_dotenv())  # read the local .env file, which defines OPENAI_API_KEY
+
+openai.api_key = os.getenv('OPENAI_API_KEY')
+
+
+def get_completion(prompt, context, model="gpt-3.5-turbo"):
+    """Wrap the OpenAI chat completion API"""
+    messages = context + [{"role": "user", "content": prompt}]
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=messages,
+        temperature=0,  # randomness of the model output; 0 means minimal randomness
+    )
+    return response.choices[0].message["content"]
+
+
+def get_embedding(text, model="text-embedding-3-small"):  # alternative: text-embedding-ada-002
+    """Wrap the OpenAI embedding API"""
+    # return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
+    return openai.Embedding.create(input=[text], model=model).data[0].embedding
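A minimal sketch of exercising these wrappers, assuming a valid OPENAI_API_KEY is set in a local .env file; the question string is hypothetical:

```
from openai_utils import get_completion, get_embedding

# Embed a short string; the wrapper returns a plain list of floats.
vec = get_embedding("hello world")
print(len(vec))  # embedding dimensionality

# Ask a question with an empty conversation context.
print(get_completion("What is retrieval-augmented generation?", context=[]))
```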
pdf_utils.py ADDED
@@ -0,0 +1,27 @@
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer
+
+def extract_text_from_pdf(filename, page_numbers=None, min_line_length=10):
+    """Extract text from a PDF file (optionally from the given pages only)"""
+    paragraphs = []
+    buffer = ''
+    full_text = ''
+    # Extract all of the text
+    for i, page_layout in enumerate(extract_pages(filename)):
+        # If a page range is given, skip pages outside it
+        if page_numbers is not None and i not in page_numbers:
+            continue
+        for element in page_layout:
+            if isinstance(element, LTTextContainer):
+                full_text += element.get_text() + '\n'
+    # Split on blank lines and reassemble the text into paragraphs
+    lines = full_text.split('\n')
+    for text in lines:
+        if len(text) >= min_line_length:
+            buffer += (' ' + text) if not text.endswith('-') else text.strip('-')
+        elif buffer:
+            paragraphs.append(buffer)
+            buffer = ''
+    if buffer:
+        paragraphs.append(buffer)
+    return paragraphs
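A usage sketch, assuming the repository's llama2-extracted.pdf is available locally; page_numbers takes 0-based page indices:

```
from pdf_utils import extract_text_from_pdf

# Extract paragraphs from the first two pages only.
paragraphs = extract_text_from_pdf("llama2-extracted.pdf", page_numbers=[0, 1])
print(len(paragraphs))
print(paragraphs[0][:200])  # first 200 characters of the first paragraph
```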
prompt_utils.py ADDED
@@ -0,0 +1,28 @@
+prompt_template = """
+You are a question-answering bot.
+Your task is to answer the user's question based on the known information given below.
+Make sure your reply relies entirely on the known information below. Do not make up answers.
+If the known information below is insufficient to answer the user's question, reply "I cannot answer your question" directly.
+
+Known information:
+__INFO__
+
+User question:
+__QUERY__
+
+Please answer the user's question in Chinese.
+"""
+
+
+def build_prompt(template=prompt_template, **kwargs):
+    """Fill in the prompt template"""
+    prompt = template
+    for k, v in kwargs.items():
+        if isinstance(v, str):
+            val = v
+        elif isinstance(v, list) and all(isinstance(elem, str) for elem in v):
+            val = '\n'.join(v)
+        else:
+            val = str(v)
+        prompt = prompt.replace(f"__{k.upper()}__", val)
+    return prompt
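A sketch of how build_prompt fills the placeholders: keyword names are upper-cased to match __INFO__ and __QUERY__, and list values are joined with newlines; the inputs here are made up:

```
from prompt_utils import build_prompt

prompt = build_prompt(
    info=["Llama 2 is a family of open LLMs.", "It was released by Meta in 2023."],
    query="Who released Llama 2?",
)
print(prompt)  # the joined info list and the question replace the placeholders
```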
requirements.txt ADDED
@@ -0,0 +1,6 @@
+openai==0.28.1
+python-dotenv
+pdfminer.six
+nltk==3.8.1
+chromadb==0.4.15
+gradio
text_utils.py ADDED
@@ -0,0 +1,26 @@
+from nltk.tokenize import sent_tokenize
+import json
+
+def split_text(paragraphs, chunk_size=300, overlap_size=100):
+    """Split text into overlapping chunks by the given chunk_size and overlap_size"""
+    sentences = [s.strip() for p in paragraphs for s in sent_tokenize(p)]
+    chunks = []
+    i = 0
+    while i < len(sentences):
+        chunk = sentences[i]
+        overlap = ''
+        prev_len = 0
+        prev = i - 1
+        # Walk backwards to build the overlap
+        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
+            overlap = sentences[prev] + ' ' + overlap
+            prev -= 1
+        chunk = overlap + chunk
+        next = i + 1
+        # Walk forwards to build the current chunk
+        while next < len(sentences) and len(sentences[next]) + len(chunk) <= chunk_size:
+            chunk = chunk + ' ' + sentences[next]
+            next += 1
+        chunks.append(chunk)
+        i = next
+    return chunks
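A sketch showing the overlap behavior on toy sentences; sent_tokenize needs NLTK's punkt data, downloaded here if missing, and the small sizes make each chunk start with the tail of the previous one:

```
import nltk
nltk.download('punkt', quiet=True)  # tokenizer data required by sent_tokenize

from text_utils import split_text

paragraphs = ["First sentence here. Second sentence here. Third sentence here."]
chunks = split_text(paragraphs, chunk_size=40, overlap_size=25)
for c in chunks:
    print(repr(c))
```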
vectordb_utils.py ADDED
@@ -0,0 +1,28 @@
+import chromadb
+from chromadb.config import Settings
+from openai_utils import get_embedding
+
+
+class InMemoryVecDB:
+
+    def __init__(self, name="demo"):
+        self.chroma_client = chromadb.Client(Settings(allow_reset=True))
+        self.chroma_client.reset()
+        self.name = name
+        self.collection = self.chroma_client.get_or_create_collection(name=name)
+
+    def add_documents(self, documents):
+        self.collection.add(
+            embeddings=[get_embedding(doc) for doc in documents],
+            documents=documents,
+            metadatas=[{"source": self.name} for _ in documents],
+            ids=[f"id_{i}" for i in range(len(documents))]
+        )
+
+    def search(self, query, top_n):
+        """Query the vector database"""
+        results = self.collection.query(
+            query_embeddings=[get_embedding(query)],
+            n_results=top_n
+        )
+        return results['documents'][0]
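A usage sketch, assuming an OpenAI key is configured, since both add_documents and search call get_embedding; the documents are hypothetical:

```
from vectordb_utils import InMemoryVecDB

db = InMemoryVecDB(name="demo")
db.add_documents([
    "Llama 2 comes in 7B, 13B, and 70B parameter sizes.",
    "Llama 2 was pretrained on 2 trillion tokens.",
])
# Return the single most similar stored document.
print(db.search("How many tokens was Llama 2 trained on?", top_n=1))
```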
web_demo.py ADDED
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# coding=utf-8
+import gradio as gr
+from openai_utils import get_completion
+from prompt_utils import build_prompt
+from vectordb_utils import InMemoryVecDB
+from pdf_utils import extract_text_from_pdf
+from text_utils import split_text
+
+vec_db = InMemoryVecDB()  # create a chromadb database object
+
+# init_db initializes the database: it extracts text from the PDF, splits it into paragraphs, and adds them to vec_db.
+def init_db(file):
+    paragraphs = extract_text_from_pdf(file.name)
+    documents = split_text(paragraphs, 500, 100)
+    vec_db.add_documents(documents)
+
+# chat first searches vec_db for the user input, then builds a prompt, and finally calls the LLM to get a reply.
+def chat(user_input, chatbot, context, search_field):
+    search_results = vec_db.search(user_input, 3)  # search the vector database for the user input
+    search_field = "\n\n".join(search_results)  # join the results into one string separated by two newlines, e.g. ['result1', 'result2'] becomes 'result1\n\nresult2', so each result starts on a new line and the output stays readable
+    prompt = build_prompt(info=search_results, query=user_input)  # build the prompt
+    response = get_completion(prompt, context)  # call the LLM to get the reply
+    chatbot.append((user_input, response))  # append the user input and the reply to the chatbot conversation
+    context.append({'role': 'user', 'content': user_input})  # add the user question under the user role
+    context.append({'role': 'assistant', 'content': response})  # add the reply under the assistant role
+    return "", chatbot, context, search_field  # return an empty string, the chatbot, the context, and the search results
+
+# Reset the chatbot state
+def reset_state():
+    return [], [], "", ""
+
+
+def main():
+    with gr.Blocks() as demo:
+        gr.HTML("""<h1 align="center">PDF Content Retriever</h1>""")
+        gr.Markdown("This demo provides a simple interface for extracting text from a PDF file and using OpenAI's LLM to retrieve relevant information.")
+
+        with gr.Row():
+            with gr.Column():
+                fileCtrl = gr.File(label="Upload file", file_types=['.pdf'])
+
+        with gr.Row():
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot()  # create a Gradio chatbot component
+            with gr.Column(scale=2):
+                # gr.HTML("""<h4>Search results</h4>""")
+                search_field = gr.Textbox(show_label=True, label="Search results", info="Retrieval uses a single-pass RAG approach", placeholder="Empty...", lines=10)
+        user_input = gr.Textbox(show_label=True, label="User input", placeholder="Click here to type...", lines=3)
+        with gr.Row():
+            submitBtn = gr.Button("Submit", variant="primary", size="lg")
+            emptyBtn = gr.Button("Clear", size="sm", variant="secondary")
+
+        context = gr.State([])
+
+        # When the submit button is clicked, call chat
+        submitBtn.click(chat, [user_input, chatbot, context, search_field],
+                        [user_input, chatbot, context, search_field])
+        # When the clear button is clicked, call reset_state
+        emptyBtn.click(reset_state, outputs=[chatbot, context, user_input, search_field])
+
+        fileCtrl.upload(init_db, inputs=[fileCtrl])
+
+    demo.queue().launch(share=True, server_name='0.0.0.0', server_port=8080, inbrowser=True)
+
+
+if __name__ == "__main__":
+    main()