Hao2727 commited on
Commit
dc41094
·
verified ·
1 Parent(s): f76a8d3

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llama2-extracted.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # dotenv
2
+ .env
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Pdfsearch
3
- emoji: 👀
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.19.2
8
- app_file: app.py
9
- pinned: false
10
  ---
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
+ title: pdfsearch
3
+ app_file: web_demo.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.15.0
 
 
6
  ---
7
+ # chatpdf
8
+ RAG homework
9
 
10
+ 运行:
11
+ ```
12
+ python web_demo.py
13
+ ```
__pycache__/openai_utils.cpython-39.pyc ADDED
Binary file (957 Bytes). View file
 
__pycache__/pdf_utils.cpython-39.pyc ADDED
Binary file (884 Bytes). View file
 
__pycache__/prompt_utils.cpython-39.pyc ADDED
Binary file (1.07 kB). View file
 
__pycache__/text_utils.cpython-39.pyc ADDED
Binary file (952 Bytes). View file
 
__pycache__/vectordb_utils.cpython-39.pyc ADDED
Binary file (1.65 kB). View file
 
import gradio as gr.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def greet(lala, intensity):
    """Return a greeting for *lala*, followed by int(intensity) exclamation marks."""
    suffix = "!" * int(intensity)
    return "Hello, " + lala + suffix
5
+
6
# Minimal Gradio wiring: a text box plus a slider feed greet(); the
# resulting greeting string comes back in a single text output.
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)

# Blocks until the server is stopped.
demo.launch()
llama2-extracted.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a965aaca87d5f56e7ab43abc562a7205d3d22f72dce49e8dac1bad3abd5f114
3
+ size 1238532
openai_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import openai
import os
# Load environment variables (a local .env file defines OPENAI_API_KEY).
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # reads the local .env file, which sets OPENAI_API_KEY

openai.api_key = os.getenv('OPENAI_API_KEY')


def get_completion(prompt, context, model="gpt-3.5-turbo"):
    """Wrapper around the ChatCompletion API (pre-1.0 client; requirements pin openai==0.28.1).

    prompt:  the new user message.
    context: prior chat turns as a list of {"role": ..., "content": ...} dicts.
    Returns the assistant's reply text.
    """
    messages = context + [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # minimise output randomness
    )
    return response.choices[0].message["content"]


def get_embedding(text, model="text-embedding-3-small"):  # previously: text-embedding-ada-002
    """Wrapper around the OpenAI Embedding API; returns a single embedding vector."""
    #return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
    return openai.Embedding.create(input=[text], model=model).data[0].embedding
pdf_utils.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfminer.high_level import extract_pages
2
+ from pdfminer.layout import LTTextContainer
3
+
4
def extract_text_from_pdf(filename, page_numbers=None, min_line_length=10):
    """Extract text from a PDF file (optionally only the given page numbers).

    filename:        path to the PDF.
    page_numbers:    iterable of 0-based page indices to keep, or None for all pages.
    min_line_length: lines shorter than this are treated as paragraph breaks.
    Returns a list of reassembled paragraph strings.
    """
    paragraphs = []
    buffer = ''
    full_text = ''
    # Collect the raw text of every selected page.
    for i, page_layout in enumerate(extract_pages(filename)):
        # Skip pages outside the requested range, if one was given.
        if page_numbers is not None and i not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                full_text += element.get_text() + '\n'
    # Reassemble the text into paragraphs, using short/empty lines as separators.
    lines = full_text.split('\n')
    for text in lines:
        if len(text) >= min_line_length:
            # Rejoin words hyphenated across a line break: drop ONLY the
            # trailing '-'. (The previous text.strip('-') also removed
            # leading hyphens, corrupting lines that start with a dash.)
            buffer += (' ' + text) if not text.endswith('-') else text[:-1]
        elif buffer:
            paragraphs.append(buffer)
            buffer = ''
    if buffer:
        paragraphs.append(buffer)
    return paragraphs
prompt_utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
prompt_template = """
你是一个问答机器人。
你的任务是根据下述给定的已知信息回答用户问题。
确保你的回复完全依据下述已知信息。不要编造答案。
如果下述已知信息不足以回答用户的问题,请直接回复"我无法回答您的问题"。

已知信息:
__INFO__

用户问:
__QUERY__

请用中文回答用户问题。
"""


def build_prompt(template=prompt_template, **kwargs):
    """Fill the __NAME__ placeholder slots of a prompt template.

    String values are inserted as-is, lists of strings are newline-joined,
    anything else is str()-converted.
    """
    def _stringify(value):
        if isinstance(value, str):
            return value
        if isinstance(value, list) and all(isinstance(item, str) for item in value):
            return '\n'.join(value)
        return str(value)

    filled = template
    for key, value in kwargs.items():
        filled = filled.replace(f"__{key.upper()}__", _stringify(value))
    return filled
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openai==0.28.1
2
+ python-dotenv
3
+ pdfminer.six
4
+ nltk==3.8.1
5
+ chromadb==0.4.15
6
+ gradio
text_utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.tokenize import sent_tokenize
2
+ import json
3
+
4
def split_text(paragraphs, chunk_size=300, overlap_size=100):
    """Split paragraphs into sentence-aligned chunks with overlapping context.

    paragraphs:   list of paragraph strings.
    chunk_size:   soft character limit for each chunk.
    overlap_size: soft character budget for sentences repeated from the
                  previous chunk as leading context.
    Returns a list of chunk strings.
    """
    sentences = [s.strip() for p in paragraphs for s in sent_tokenize(p)]
    chunks = []
    i = 0
    while i < len(sentences):
        chunk = sentences[i]
        overlap = ''
        prev = i - 1
        # Walk backwards, prepending earlier sentences while they fit the overlap budget.
        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
            overlap = sentences[prev] + ' ' + overlap
            prev -= 1
        chunk = overlap + chunk
        # Walk forwards, appending sentences while the chunk stays within chunk_size.
        # (Renamed from `next` to avoid shadowing the builtin; dropped the unused prev_len.)
        nxt = i + 1
        while nxt < len(sentences) and len(sentences[nxt]) + len(chunk) <= chunk_size:
            chunk = chunk + ' ' + sentences[nxt]
            nxt += 1
        chunks.append(chunk)
        i = nxt
    return chunks
vectordb_utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.config import Settings
3
+ from openai_utils import get_embedding
4
+
5
+
6
class InMemoryVecDB:
    """Thin wrapper around an in-memory chromadb collection keyed by OpenAI embeddings."""

    def __init__(self, name="demo"):
        self.chroma_client = chromadb.Client(Settings(allow_reset=True))
        self.chroma_client.reset()
        self.name = name
        self.collection = self.chroma_client.get_or_create_collection(name=name)

    def add_documents(self, documents):
        """Embed each document and store it in the collection."""
        embeddings = [get_embedding(doc) for doc in documents]
        metadatas = [{"source": self.name} for _ in documents]
        ids = [f"id_{idx}" for idx in range(len(documents))]
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )

    def search(self, query, top_n):
        """Return the top_n stored documents most similar to *query*."""
        query_vec = get_embedding(query)
        results = self.collection.query(
            query_embeddings=[query_vec],
            n_results=top_n,
        )
        return results['documents'][0]
web_demo.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ import gradio as gr
4
+ from openai_utils import get_completion
5
+ from prompt_utils import build_prompt
6
+ from vectordb_utils import InMemoryVecDB
7
+ from pdf_utils import extract_text_from_pdf
8
+ from text_utils import split_text
9
+
10
vec_db = InMemoryVecDB()  # module-level chromadb-backed vector store shared by all handlers
11
+
12
# Initialise the vector DB from an uploaded PDF: extract the text, split it
# into overlapping chunks, then index the chunks.
def init_db(file):
    """Populate the module-level vector DB from the uploaded PDF file."""
    extracted = extract_text_from_pdf(file.name)
    chunks = split_text(extracted, 500, 100)
    vec_db.add_documents(chunks)
17
+
18
# Handle one chat turn: retrieve relevant chunks from the vector DB, build a
# grounded prompt, query the LLM, and update the conversation state.
def chat(user_input, chatbot, context, search_field):
    search_results = vec_db.search(user_input, 3)  # top-3 nearest chunks for this query
    search_field = "\n\n".join(search_results)  # joined only for display in the UI textbox
    # NOTE: the prompt is built from the raw list (newline-joined inside
    # build_prompt), not from the double-newline search_field string above.
    prompt = build_prompt(info = search_results, query = user_input)
    response = get_completion(prompt, context)
    chatbot.append((user_input, response))  # show the turn in the chat widget
    context.append({'role': 'user', 'content': user_input})  # keep model-side history
    context.append({'role': 'assistant', 'content': response})
    return "", chatbot, context, search_field  # "" clears the input textbox
28
+
29
# Reset the chatbot UI back to an empty conversation.
def reset_state():
    """Return empty chatbot history, empty context, and cleared text fields."""
    empty_history, empty_context = [], []
    return empty_history, empty_context, "", ""
32
+
33
+
34
def main():
    """Build and launch the Gradio UI for PDF upload + RAG-style Q&A."""
    with gr.Blocks() as demo:
        gr.HTML("""<h1 align="center">PDF内容检索器</h1>""")
        gr.Markdown("本demo为您提供了一个简单的界面,用于从PDF文件中提取文本,并使用OpenAI的大模型来检索相关的信息。")

        with gr.Row():
            with gr.Column():
                # BUG FIX: file_types entries must be extensions starting with
                # '.'; the previous [',pdf'] (comma typo) never matched PDFs.
                fileCtrl = gr.File(label="上传文件", file_types=[".pdf"])

        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot()  # conversation history widget
            with gr.Column(scale=2):
                search_field = gr.Textbox(show_label=True, label="检索结果", info="本检索采用的是单一RAG方式", placeholder="空...", lines=10)
        user_input = gr.Textbox(show_label=True, label="用户输入", placeholder="请点击此处输入...", lines=3)
        with gr.Row():
            submitBtn = gr.Button("提 交", variant="primary", size="lg")
            emptyBtn = gr.Button("清 空", size="sm", variant="secondary")

        # Per-session chat history in OpenAI message format.
        context = gr.State([])

        # Submit runs one chat turn; outputs clear the input box and refresh
        # the chatbot, context, and retrieval display.
        submitBtn.click(chat, [user_input, chatbot, context, search_field],
                        [user_input, chatbot, context, search_field])
        # Clear resets all four pieces of UI state.
        emptyBtn.click(reset_state, outputs=[chatbot, context, user_input, search_field])

        # Index the PDF into the vector DB as soon as it is uploaded.
        fileCtrl.upload(init_db, inputs=[fileCtrl])

    demo.queue().launch(share=True, server_name='0.0.0.0', server_port=8080, inbrowser=True)


if __name__ == "__main__":
    main()