Vasanth Sarathy committed
Commit 73f6aba · Parent(s): e3b439a

Working streamlit app

Files changed:
- .gitattributes +2 -0
- app.py +146 -5
- faiss_document_store.db +3 -0
- faiss_document_store.faiss +3 -0
- faiss_document_store.json +1 -0
- pipelines.py +118 -0
- requirements.txt +2 -1
- utils.py +24 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+faiss_document_store.db filter=lfs diff=lfs merge=lfs -text
+faiss_document_store.faiss filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -1,8 +1,149 @@
-import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!"
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
-demo.launch()
+import streamlit as st
+import os
+from pipelines import get_pipeline
+import logging
+from json import JSONDecodeError
+from utils import find_substring_indices
+from annotated_text import annotation
+from markdown import markdown
+
+# Sliders
+DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
+
+
+def set_state_if_absent(key, value):
+    if key not in st.session_state:
+        st.session_state[key] = value
+
+
+def query(concept, filters={}, top_k_retriever=5):
+    params = {"Retriever": {"top_k": top_k_retriever}}
+    pipe = get_pipeline("data/narratives/processed")
+
+    prediction = pipe.run(query=concept, params=params)
+
+    # Format results
+    results = []
+    spans = prediction['results']
+    for idx, span in enumerate(spans):
+        context = prediction["documents"][idx].to_dict()['content']
+        span_indices = find_substring_indices(context, span)
+
+        if span_indices:
+            result = {"context": context,
+                      "span": span,
+                      "span_start": span_indices[0],
+                      "span_end": span_indices[1]}
+            results.append(result)
+    return results
+
+
+def main():
+
+    st.set_page_config(page_title="Anchor")
+
+    # Persistent state
+    set_state_if_absent("question", "husband's permission")
+    set_state_if_absent("results", None)
+    set_state_if_absent("raw_json", None)
+    set_state_if_absent("random_question_requested", False)
+
+    # Small callback to reset the interface in case the text of the question changes
+    def reset_results(*args):
+        st.session_state.answer = None
+        st.session_state.results = None
+        st.session_state.raw_json = None
+
+    # Title
+    st.write("""
+    # ⚓ ANCHOR
+
+    #### Grounding Abstract Concepts in Text
+    """)
+
+    # Sidebar
+    st.sidebar.header("Options")
+
+    top_k_retriever = st.sidebar.slider(
+        "Max. number of documents from retriever",
+        min_value=1,
+        max_value=20,
+        value=DEFAULT_DOCS_FROM_RETRIEVER,
+        step=1,
+        on_change=reset_results,
+    )
+
+    # Search bar
+    question = st.text_input(
+        value=st.session_state.question,
+        max_chars=100,
+        on_change=reset_results,
+        label="Concept",
+        label_visibility="visible",
+    )
+    col1, col2 = st.columns(2)
+    col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
+    col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
+
+    # Run button
+    run_pressed = col1.button("Run")
+
+    run_query = (run_pressed or question != st.session_state.question)
+
+    # Get results for query
+    if run_query and question:
+        reset_results()
+        st.session_state.question = question
+
+        with st.spinner("🧠 Performing neural search on documents... \n "):
+            try:
+                st.session_state.results = query(
+                    question, top_k_retriever=top_k_retriever
+                )
+            except JSONDecodeError as je:
+                st.error("👓 An error occurred reading the results. Is the document store working?")
+                return
+            except Exception as e:
+                logging.exception(e)
+                if "The server is busy processing requests" in str(e) or "503" in str(e):
+                    st.error("🧑‍🌾 All our workers are busy! Try again later.")
+                else:
+                    st.error("🐞 An error occurred during the request.")
+                return
+
+    if st.session_state.results:
+
+        st.write("## Results:")
+
+        for count, result in enumerate(st.session_state.results):
+            if result['span']:
+                st.write(
+                    markdown(result['context'][:result['span_start']] +
+                             str(annotation(result['span'], "anchor", "#fad6a5")) +
+                             result['context'][result['span_end']+1:]),
+                    unsafe_allow_html=True
+                )
+            else:
+                st.info(
+                    "🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
+                )
+
+
+main()
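Note: the span-highlighting step in main() splices an annotated span back into its surrounding context before rendering it as HTML. A minimal, self-contained sketch of just that step, using a hypothetical context string and indices (the real indices come from utils.find_substring_indices, which returns inclusive start/end positions):

    from annotated_text import annotation
    from markdown import markdown

    # Hypothetical context and span indices (start and end are inclusive).
    context = "He asked for his wife's opinion before selling the farm."
    span_start, span_end = 17, 30
    span = context[span_start:span_end + 1]  # "wife's opinion"

    # Wrap the span in a colored annotation tag and re-render as HTML,
    # mirroring the composition app.py passes to st.write.
    html = markdown(
        context[:span_start]
        + str(annotation(span, "anchor", "#fad6a5"))
        + context[span_end + 1:]
    )
    print(html)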
faiss_document_store.db
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:815c57a332029c405a6bda58876f80db6a31a60fbbd35b8a4ea8595e9fcd398a
+size 1839104
faiss_document_store.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e55b94d536a2d9998f7cd9a24c9dc13bd86072699ae2045cc574a7ff7a5b0af
+size 5050413
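Note: the two files above are Git LFS pointers rather than the binaries themselves; the repo tracks only a version line, a sha256 content hash, and the byte size, while LFS storage holds the actual index. A small sketch of reading those fields in Python, assuming the pointer file sits in the working tree:

    # Parse a Git LFS pointer file into its key/value fields.
    def read_lfs_pointer(path):
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    # -> {'version': 'https://git-lfs.github.com/spec/v1',
    #     'oid': 'sha256:5e55...', 'size': '5050413'}
    print(read_lfs_pointer("faiss_document_store.faiss"))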
faiss_document_store.json
ADDED
@@ -0,0 +1 @@
+{"faiss_index_factory_str": "Flat"}
pipelines.py
ADDED
@@ -0,0 +1,118 @@
+import os
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.pipelines.standard_pipelines import TextIndexingPipeline
+from haystack.nodes import BM25Retriever
+from haystack.nodes import FARMReader
+from haystack.pipelines import ExtractiveQAPipeline
+from haystack.nodes.other import Shaper
+from haystack.nodes import PromptNode, PromptTemplate
+from haystack.pipelines import Pipeline
+
+from haystack.document_stores import FAISSDocumentStore
+from haystack.nodes import EmbeddingRetriever
+
+from haystack.utils import convert_files_to_docs
+
+
+# Set logging level to INFO
+import logging
+logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)
+
+api_key = "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"
+#faiss_index_path = "faiss_document_store.db"
+
+
+# Shaper helps expand the `query` variable into a list of identical queries (length of documents)
+# and store the list of queries in the `questions` variable
+# (the variable used in the question answering template)
+def get_pipeline(doc_dir):
+
+    # Registering a new prompt template called "concept-exemplar"
+    prompt_template = \
+        """
+        Identify a non-overlapping span of text in the given
+        context that resonates with the given concept. By resonate we mean that the
+        meaning of the concept is captured in the span. The span exemplifies what
+        the concept means. The identified span MUST be present verbatim in the context. \n\nConcept: 'family support'\nContext: Abubakar and his
+        wife are expecting his mother to help his wife with the new born. It is
+        evening and she has not arrived yet. Then he decided to call the neighbor's
+        wife to Come and help the baby and the new born woman to boil hot water and
+        baths the baby and made the baby to sleep before the mother come
+        back.\nSpan: are expecting his mother to help\n\n\nConcept: $concepts
+        \nContext: $documents\nSpan:
+        """
+    template = PromptTemplate(name="concept-exemplar", prompt_text=prompt_template)
+    prompt_node = PromptNode("text-davinci-003", api_key=api_key)
+    prompt_node.add_prompt_template(template)
+
+    # Set concept-exemplar as my default
+    exemplifier = prompt_node.set_default_prompt_template("concept-exemplar")
+
+    shaper = Shaper(func="value_to_list", inputs={"value": "query", "target_list": "documents"}, outputs=["concepts"])
+
+    if os.path.exists("faiss_document_store.db"):
+        print("FAISS document store already exists")
+        document_store = FAISSDocumentStore(
+            faiss_index_path="faiss_document_store.faiss",
+            faiss_config_path="faiss_document_store.json")
+
+        retriever = EmbeddingRetriever(
+            document_store=document_store,
+            embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+    else:
+        print("New Document Store created")
+        document_store = FAISSDocumentStore(faiss_index_factory_str='Flat')
+
+        docs = convert_files_to_docs(dir_path=doc_dir)
+        document_store.write_documents(docs)
+
+        # 4. Set up retriever
+        # bm25_retriever = BM25Retriever(document_store=document_store)
+        retriever = EmbeddingRetriever(
+            document_store=document_store,
+            embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+        )
+        # Important:
+        # Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all
+        # previously indexed documents and update their embedding representation.
+        # While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
+        # At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.
+        document_store.update_embeddings(retriever)
+
+        document_store.save("faiss_document_store.faiss")
+
+    # # 1. Setup document store
+    # if os.path.exists("faiss_document_store.json"):
+    #     print("Path exists")
+    #     document_store = FAISSDocumentStore(
+    #         faiss_index_path="faiss_document_store.faiss",
+    #         faiss_config_path="faiss_document_store.json")
+    # else:
+    #     print("path does not exist")
+    #     document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)
+    #     document_store.save("faiss_document_store.faiss")
+
+    #document_store.save("faiss_index")
+    # document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+    #document_store = InMemoryDocumentStore(use_bm25=True)
+
+    # 2. Put files in habitus folder into a list for indexing
+    #files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
+
+    # 3. Set up text indexing pipeline and index all files in folder
+    #indexing_pipeline = TextIndexingPipeline(document_store)
+    #indexing_pipeline.run_batch(file_paths=files_to_index)
+
+    # New combined pipeline
+    pipe = Pipeline()
+    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
+    pipe.add_node(component=shaper, name="shaper", inputs=["Retriever"])
+    pipe.add_node(component=exemplifier, name="exemplifier", inputs=['shaper'])
+
+    return pipe
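Note: the Shaper's job is easier to see in isolation. It repeats the single query once per retrieved document, so the $concepts variable lines up one-to-one with $documents in the prompt template. A hedged plain-Python sketch of that value_to_list transformation, with made-up document texts:

    query = "family support"
    documents = ["first narrative text", "second narrative text"]

    # value_to_list copies `query` len(documents) times; the pipeline
    # exposes the copies to the prompt template under the name "concepts".
    concepts = [query] * len(documents)
    assert len(concepts) == len(documents)

End to end, the pipeline is then driven the way app.py does it: get_pipeline(...) followed by pipe.run(query=..., params={"Retriever": {"top_k": k}}), with the generated spans returned under prediction["results"].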
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
-gradio
 farm-haystack[all-gpu]
+streamlit
+st-annotated-text
utils.py
ADDED
@@ -0,0 +1,24 @@
+import ast
+import re
+
+def get_span_indices(document, span):
+    print(f"\nSpan: {span}")
+    print(f"Document: {document}")
+    res = re.search(span, document, re.IGNORECASE)
+    print(f"Res: {res}")
+    print(f"Find: {document.find(span)}")
+    if res:
+        return res.span()
+
+def find_substring_indices(string, substring):
+    substring = remove_trailing_periods(substring)
+    start_index = string.lower().find(substring.lower())
+    if start_index == -1:
+        return None
+    end_index = start_index + len(substring) - 1
+    return (start_index, end_index)
+
+def remove_trailing_periods(string):
+    while string.endswith('.'):
+        string = string[:-1]
+    return string
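Note: a quick usage sketch of find_substring_indices with a made-up context string, showing the trailing-period stripping, the case-insensitive match, and the inclusive end index:

    from utils import find_substring_indices

    context = "She said she would need her husband's permission to travel."

    # The trailing period on the model's span is stripped before matching,
    # and the lookup is case-insensitive.
    print(find_substring_indices(context, "Husband's permission."))  # (28, 47)

    # A span that never occurs in the context returns None.
    print(find_substring_indices(context, "mother's permission"))    # None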