danicafisher committed
Commit 036f779 • 1 Parent(s): 412d04a
Updates requirements
Browse files
- app.py +21 -9
- requirements.txt +2 -1
- synthetic_data_generation.ipynb +4 -12
app.py
CHANGED
@@ -14,34 +14,46 @@ import chainlit as cl
 import asyncio
 import nest_asyncio
 nest_asyncio.apply()
-
+import langchain_community
 from langchain_community.document_loaders import PyMuPDFLoader
+import langchain
+from langchain.prompts import ChatPromptTemplate
 
 filepath_NIST = "data/NIST.AI.600-1.pdf"
 filepath_Blueprint = "data/Blueprint-for-an-AI-Bill-of-Rights.pdf"
 
 documents_NIST = PyMuPDFLoader(filepath_NIST).load()
 documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()
+documents = documents_NIST + documents_Blueprint
 
 # pdf_loader_NIST = PDFFileLoader("data/NIST.AI.600-1.pdf")
 # pdf_loader_Blueprint = PDFFileLoader("data/Blueprint-for-an-AI-Bill-of-Rights.pdf")
 # documents_NIST = pdf_loader_NIST.load_documents()
 # documents_Blueprint = pdf_loader_Blueprint.load_documents()
 
-text_splitter = CharacterTextSplitter()
-split_documents_NIST = text_splitter.split_texts(documents_NIST)
-split_documents_Blueprint = text_splitter.split_texts(documents_Blueprint)
+# text_splitter = CharacterTextSplitter()
+# split_documents_NIST = text_splitter.split_texts(documents_NIST)
+# split_documents_Blueprint = text_splitter.split_texts(documents_Blueprint)
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size = 500,
+    chunk_overlap = 50
+)
 
+rag_documents = text_splitter.split_documents(documents)
 
-
-
+RAG_PROMPT = """\
+Given a provided context and question, you must answer the question based only on context.
 
-
+If you cannot answer the question based on the context - you must say "I don't know".
 
-
+Context: {context}
+Question: {question}
 """
 
-rag_prompt =
+rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
 
 USER_PROMPT_TEMPLATE = """ \
 Context:
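The app.py change swaps the old CharacterTextSplitter for a RecursiveCharacterTextSplitter (500-character chunks with 50-character overlap) and builds rag_prompt from a ChatPromptTemplate, but this hunk does not show how rag_documents and rag_prompt are consumed. A minimal sketch of the likely downstream wiring, assuming a FAISS vector store and OpenAI models (neither appears in this diff, so both are assumptions):

# Sketch only; the retriever and LLM choices are assumptions, not part of this commit.
# Requires faiss-cpu in addition to the packages in requirements.txt.
from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# Embed the 500-character chunks produced above and expose them as a retriever.
vectorstore = FAISS.from_documents(rag_documents, OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

llm = ChatOpenAI(model="gpt-3.5-turbo")  # hypothetical model choice

# Same three-step chain shape used in synthetic_data_generation.ipynb below:
# fetch context for the question, carry it forward, then prompt the LLM.
rag_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)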
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ numpy
 chainlit==0.7.700
 openai
 PyPDF2
-langchain_community
+langchain_community
+langchain
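requirements.txt now lists both langchain_community (already imported by app.py's document loader) and langchain (needed for the new ChatPromptTemplate import). A quick sanity check that the environment resolves, as a sketch rather than repo code:

# Verify the two unpinned dependencies import cleanly in the Space's environment.
import langchain
import langchain_community
from langchain.prompts import ChatPromptTemplate  # the import app.py now relies on

print(langchain.__version__, langchain_community.__version__)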
synthetic_data_generation.ipynb
CHANGED
@@ -865,7 +865,7 @@
 "\n",
 "generator_llm = ChatOpenAI(model=\"gpt-3.5-turbo\")\n",
 "critic_llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) \n",
-"embeddings = OpenAIEmbeddings()\n",
+"embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
 "\n",
 "generator = TestsetGenerator.from_langchain(\n",
 " generator_llm,\n",
@@ -982,21 +982,13 @@
 " | rag_prompt | llm | StrOutputParser()\n",
 ")\n",
 "\n",
-"
-" # INVOKE CHAIN WITH: {\"question\" : \"<<SOME USER QUESTION>>\"}\n",
-" # \"question\" : populated by getting the value of the \"question\" key\n",
-" # \"context\" : populated by getting the value of the \"question\" key and chaining it into the base_retriever\n",
+"rag_qa_chain = (\n",
 " {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
-" # \"context\" : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)\n",
-" # by getting the value of the \"context\" key from the previous step\n",
 " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
-" # \"response\" : the \"context\" and \"question\" values are used to format our prompt object and then piped\n",
-" # into the LLM and stored in a key called \"response\"\n",
-" # \"context\" : populated by getting the value of the \"context\" key from the previous step\n",
 " | {\"response\": rag_prompt | llm, \"context\": itemgetter(\"context\")}\n",
 ")\n",
 "\n",
-"result =
+"result = rag_qa_chain.invoke({\"question\" : \"Is AI a threat to humanity?\"})\n",
 "print(result)"
 ]
 },
@@ -1010,7 +1002,7 @@
 "contexts = []\n",
 "\n",
 "for question in test_questions:\n",
-" response =
+" response = rag_qa_chain.invoke({\"question\" : question})\n",
 " answers.append(response[\"response\"].content)\n",
 " contexts.append([context.page_content for context in response[\"context\"]])"
 ]
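With this change the notebook invokes rag_qa_chain once per synthetic test question, collecting the model answers and the retrieved contexts. A sketch of how those lists could then be scored with ragas; the metric choice and the test_groundtruths column are assumptions about notebook cells this diff does not show:

# Sketch: assemble the collected RAG outputs for ragas scoring.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy

eval_dataset = Dataset.from_dict({
    "question": test_questions,         # generated by TestsetGenerator above
    "answer": answers,                  # filled in by the loop in this diff
    "contexts": contexts,               # retrieved page_content per question
    "ground_truth": test_groundtruths,  # hypothetical reference answers
})

results = evaluate(eval_dataset, metrics=[faithfulness, answer_relevancy])
print(results)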