Spaces:

SamOliveira
/

research_assistant

Sleeping

App Files Files Community

Sam-Oliveira committed on Jun 23

Commit

b9e7b32

1 Parent(s): 06018df

Fixed locally

Browse files

Files changed (3) hide show

.gitignore +2 -1
src/scrape.py +33 -18
src/streamlit_app.py +14 -8

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 .DS_Store
 ra_env
-src/__pycache__

 .DS_Store
 ra_env
+src/__pycache__
+src/papers.db

src/scrape.py CHANGED Viewed

@@ -2,41 +2,56 @@ import time, arxiv
 from query_builder import build_query
 from db import get_conn
 from config import MAX_RESULTS
-import os
-import pathlib
-import os, pathlib, uuid, shutil
-BASE_CACHE = pathlib.Path("/data")              # always writable in Spaces
-CACHE_DIR  = BASE_CACHE / "hf_cache" / str(os.getpid())
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-# 1) Point every HF-related lib there
-os.environ["HF_HOME"]                    = str(CACHE_DIR)
-os.environ["HF_HUB_CACHE"]               = str(CACHE_DIR)
-os.environ["TRANSFORMERS_CACHE"]         = str(CACHE_DIR)
-os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(CACHE_DIR)
-# 2) Remove any stale lock that might have been copied along
-lock_file = CACHE_DIR / ".lock"
-if lock_file.exists():
-    lock_file.unlink()
-# 3) Now import and load the model safely
 from sentence_transformers import SentenceTransformer
 from keybert import KeyBERT
 st_model = SentenceTransformer(
     "sentence-transformers/all-MiniLM-L6-v2",
-    cache_folder=str(CACHE_DIR)
 )
 kw_model = KeyBERT(st_model)
 def make_tags(title, abstract, top_n=5):
     """
     Extract keywords from the title and abstract using KeyBERT.
     """
-    phrases = _kw.extract_keywords(f"{title}. {abstract}",
                                    top_n=top_n,
                                    stop_words="english",
                                    use_mmr=True)

 from query_builder import build_query
 from db import get_conn
 from config import MAX_RESULTS
+import os, pathlib, tempfile,uuid, shutil
+CACHE_DIR = pathlib.Path(tempfile.gettempdir()) / "hf_cache"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)      # guaranteed writable
+for var in (
+    "HF_HOME",
+    "HF_HUB_CACHE",
+    "TRANSFORMERS_CACHE",
+    "SENTENCE_TRANSFORMERS_HOME",
+):
+    os.environ[var] = str(CACHE_DIR)
+from sentence_transformers import SentenceTransformer
+from keybert import KeyBERT
+st_model = SentenceTransformer(
+    "sentence-transformers/all-MiniLM-L6-v2",
+    cache_folder=str(CACHE_DIR)          # explicit path
+)
+kw_model = KeyBERT(st_model)
+"""
+# For my Mac
 from sentence_transformers import SentenceTransformer
 from keybert import KeyBERT
+# Use a writable cache directory on macOS
+cache_dir = os.path.expanduser("~/cache")
+os.makedirs(cache_dir, exist_ok=True)
+os.environ["HF_HOME"]                    = cache_dir
+os.environ["HF_HUB_CACHE"]               = cache_dir
+os.environ["TRANSFORMERS_CACHE"]         = cache_dir
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
 st_model = SentenceTransformer(
     "sentence-transformers/all-MiniLM-L6-v2",
+    cache_folder=cache_dir                 # <- writable
 )
 kw_model = KeyBERT(st_model)
+"""
 def make_tags(title, abstract, top_n=5):
     """
     Extract keywords from the title and abstract using KeyBERT.
     """
+    phrases = kw_model.extract_keywords(f"{title}. {abstract}",
                                    top_n=top_n,
                                    stop_words="english",
                                    use_mmr=True)

src/streamlit_app.py CHANGED Viewed

@@ -12,7 +12,6 @@ import html as ihtml
 from datetime import date
 from config     import MAX_RESULTS
 from scrape     import scrape
-from summarise  import summarise_pending
 from digest     import build_html
 from ideate     import ideate_from_topic, ideate_from_ids
 from helpers    import render_rows, rows_by_tag
@@ -39,9 +38,10 @@ with tab1:
     category = c4.text_input("Category (e.g. cs.CL)")
     k = st.slider("Max papers", 5, 50, 25)
     if st.button("Run search"):
-        scrape(max_results=k, topic=topic, title=title,
                author=author, category=category)
-        st.success("Scraped, tagged, stored!")
         from db import get_conn
         newest = get_conn().execute(
             "SELECT title, authors, abstract, published FROM papers "
@@ -54,8 +54,12 @@ with tab2:
     st.header("Get a digest from the latest papers you have previously scraped")
     d_topic = st.text_input("Keyword to match tags", value="large language")
     if st.button("Generate digest"):
-        summarise_by_tag(d_topic)
-        rows = rows_by_tag(d_topic, MAX_RESULTS)
         st.components.v1.html(render_rows(rows), height=800, scrolling=True)
 with tab3:
@@ -65,7 +69,8 @@ with tab3:
     if mode == "Keyword":
         kw = st.text_input("Keyword")
         if st.button("Ideate"):
-            ideas = ideate_from_topic(kw)
             if ideas is None:
                 st.info("No papers in the database match that keyword. "
                         "Try running a search in the **Search** tab first.")
@@ -76,8 +81,9 @@ with tab3:
         ids_in = st.text_area("Comma-separated IDs",
                               placeholder="2406.01234,2405.01234")
         if st.button("Ideate"):
-            ids   = [x.strip() for x in ids_in.split(",") if x.strip()]
-            ideas = ideate_from_ids(ids)
             if ideas is None:
                 st.info("Those IDs aren’t in the database yet. "
                         "Fetch them via the **Search** tab, then try again.")

 from datetime import date
 from config     import MAX_RESULTS
 from scrape     import scrape
 from digest     import build_html
 from ideate     import ideate_from_topic, ideate_from_ids
 from helpers    import render_rows, rows_by_tag
     category = c4.text_input("Category (e.g. cs.CL)")
     k = st.slider("Max papers", 5, 50, 25)
     if st.button("Run search"):
+        with st.spinner("Scraping papers, and storing them..."):
+            scrape(max_results=k, topic=topic, title=title,
                author=author, category=category)
+        st.success("All done!")
         from db import get_conn
         newest = get_conn().execute(
             "SELECT title, authors, abstract, published FROM papers "
     st.header("Get a digest from the latest papers you have previously scraped")
     d_topic = st.text_input("Keyword to match tags", value="large language")
     if st.button("Generate digest"):
+        with st.spinner("Finding papers and summarising them..."):
+            summarise_by_tag(d_topic)
+            rows = rows_by_tag(d_topic, MAX_RESULTS)
+    if not rows:
+        st.info("No papers found; try the Search tab.")
+    else:
         st.components.v1.html(render_rows(rows), height=800, scrolling=True)
 with tab3:
     if mode == "Keyword":
         kw = st.text_input("Keyword")
         if st.button("Ideate"):
+            with st.spinner("Thinking of new ideas..."):
+                ideas = ideate_from_topic(kw)
             if ideas is None:
                 st.info("No papers in the database match that keyword. "
                         "Try running a search in the **Search** tab first.")
         ids_in = st.text_area("Comma-separated IDs",
                               placeholder="2406.01234,2405.01234")
         if st.button("Ideate"):
+            with st.spinner("Thinking of new ideas..."):
+                ids   = [x.strip() for x in ids_in.split(",") if x.strip()]
+                ideas = ideate_from_ids(ids)
             if ideas is None:
                 st.info("Those IDs aren’t in the database yet. "
                         "Fetch them via the **Search** tab, then try again.")