Sam-Oliveira committed
Commit b9e7b32 · 1 Parent(s): 06018df

Fixed locally
Files changed (3):
  1. .gitignore           +2 -1
  2. src/scrape.py        +33 -18
  3. src/streamlit_app.py +14 -8
.gitignore CHANGED
@@ -1,3 +1,4 @@
 .DS_Store
 ra_env
-src/__pycache__
+src/__pycache__
+src/papers.db
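
Note: the newly ignored src/papers.db is the SQLite file that db.get_conn (imported by both changed modules) writes at runtime, so it should not be committed. The real src/db.py is not part of this commit; the following is only a hypothetical sketch of what such a module typically looks like, with the column names taken from the SELECT in streamlit_app.py and the id column assumed:

    import sqlite3, pathlib

    # Hypothetical sketch of src/db.py -- not the actual module in this repo.
    DB_PATH = pathlib.Path(__file__).parent / "papers.db"  # the file now git-ignored

    def get_conn():
        conn = sqlite3.connect(DB_PATH)  # creates papers.db on first use
        conn.execute(
            "CREATE TABLE IF NOT EXISTS papers ("
            "id TEXT PRIMARY KEY, "  # assumed key; only the other columns appear in this commit
            "title TEXT, authors TEXT, abstract TEXT, published TEXT)"
        )
        return conn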
src/scrape.py CHANGED
@@ -2,41 +2,56 @@ import time, arxiv
 from query_builder import build_query
 from db import get_conn
 from config import MAX_RESULTS
-import os
-import pathlib
-import os, pathlib, uuid, shutil
+import os, pathlib, tempfile, uuid, shutil
 
-BASE_CACHE = pathlib.Path("/data")  # always writable in Spaces
-CACHE_DIR = BASE_CACHE / "hf_cache" / str(os.getpid())
 
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
+CACHE_DIR = pathlib.Path(tempfile.gettempdir()) / "hf_cache"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)  # guaranteed writable
 
-# 1) Point every HF-related lib there
-os.environ["HF_HOME"] = str(CACHE_DIR)
-os.environ["HF_HUB_CACHE"] = str(CACHE_DIR)
-os.environ["TRANSFORMERS_CACHE"] = str(CACHE_DIR)
-os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(CACHE_DIR)
+for var in (
+    "HF_HOME",
+    "HF_HUB_CACHE",
+    "TRANSFORMERS_CACHE",
+    "SENTENCE_TRANSFORMERS_HOME",
+):
+    os.environ[var] = str(CACHE_DIR)
 
-# 2) Remove any stale lock that might have been copied along
-lock_file = CACHE_DIR / ".lock"
-if lock_file.exists():
-    lock_file.unlink()
+from sentence_transformers import SentenceTransformer
+from keybert import KeyBERT
+
+st_model = SentenceTransformer(
+    "sentence-transformers/all-MiniLM-L6-v2",
+    cache_folder=str(CACHE_DIR)  # explicit path
+)
+kw_model = KeyBERT(st_model)
+
+"""
 
-# 3) Now import and load the model safely
+# For my Mac
 from sentence_transformers import SentenceTransformer
 from keybert import KeyBERT
+# Use a writable cache directory on macOS
+cache_dir = os.path.expanduser("~/cache")
+os.makedirs(cache_dir, exist_ok=True)
+
+os.environ["HF_HOME"] = cache_dir
+os.environ["HF_HUB_CACHE"] = cache_dir
+os.environ["TRANSFORMERS_CACHE"] = cache_dir
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
+
 
 st_model = SentenceTransformer(
     "sentence-transformers/all-MiniLM-L6-v2",
-    cache_folder=str(CACHE_DIR)
+    cache_folder=cache_dir  # <- writable
 )
 kw_model = KeyBERT(st_model)
+"""
 
 def make_tags(title, abstract, top_n=5):
     """
     Extract keywords from the title and abstract using KeyBERT.
     """
-    phrases = _kw.extract_keywords(f"{title}. {abstract}",
+    phrases = kw_model.extract_keywords(f"{title}. {abstract}",
                                    top_n=top_n,
                                    stop_words="english",
                                    use_mmr=True)
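
Note: the fix here is twofold. The undefined _kw in make_tags becomes kw_model, and the Hugging Face cache moves from /data to the system temp directory, with every cache variable exported before sentence_transformers is imported. A minimal standalone sketch of that load-then-tag pattern (the sample title/abstract text is invented; extract_keywords returns (phrase, score) pairs):

    import os, pathlib, tempfile

    cache_dir = pathlib.Path(tempfile.gettempdir()) / "hf_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    for var in ("HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE",
                "SENTENCE_TRANSFORMERS_HOME"):
        os.environ[var] = str(cache_dir)  # must be set before the imports below

    from sentence_transformers import SentenceTransformer
    from keybert import KeyBERT

    st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2",
                                   cache_folder=str(cache_dir))
    kw_model = KeyBERT(st_model)

    pairs = kw_model.extract_keywords(
        "Attention Is All You Need. We propose the Transformer, an "
        "architecture based solely on attention mechanisms.",  # invented sample text
        top_n=5, stop_words="english", use_mmr=True)
    print([phrase for phrase, score in pairs])  # the five extracted keyphrases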
src/streamlit_app.py CHANGED
@@ -12,7 +12,6 @@ import html as ihtml
 from datetime import date
 from config import MAX_RESULTS
 from scrape import scrape
-from summarise import summarise_pending
 from digest import build_html
 from ideate import ideate_from_topic, ideate_from_ids
 from helpers import render_rows, rows_by_tag
@@ -39,9 +38,10 @@ with tab1:
     category = c4.text_input("Category (e.g. cs.CL)")
     k = st.slider("Max papers", 5, 50, 25)
     if st.button("Run search"):
-        scrape(max_results=k, topic=topic, title=title,
+        with st.spinner("Scraping papers, and storing them..."):
+            scrape(max_results=k, topic=topic, title=title,
                    author=author, category=category)
-        st.success("Scraped, tagged, stored!")
+        st.success("All done!")
         from db import get_conn
         newest = get_conn().execute(
             "SELECT title, authors, abstract, published FROM papers "
@@ -54,8 +54,12 @@ with tab2:
     st.header("Get a digest from the latest papers you have previously scraped")
     d_topic = st.text_input("Keyword to match tags", value="large language")
     if st.button("Generate digest"):
-        summarise_by_tag(d_topic)
-        rows = rows_by_tag(d_topic, MAX_RESULTS)
+        with st.spinner("Finding papers and summarising them..."):
+            summarise_by_tag(d_topic)
+            rows = rows_by_tag(d_topic, MAX_RESULTS)
+        if not rows:
+            st.info("No papers found; try the Search tab.")
+        else:
             st.components.v1.html(render_rows(rows), height=800, scrolling=True)
 
 with tab3:
@@ -65,7 +69,8 @@ with tab3:
     if mode == "Keyword":
         kw = st.text_input("Keyword")
         if st.button("Ideate"):
-            ideas = ideate_from_topic(kw)
+            with st.spinner("Thinking of new ideas..."):
+                ideas = ideate_from_topic(kw)
             if ideas is None:
                 st.info("No papers in the database match that keyword. "
                         "Try running a search in the **Search** tab first.")
@@ -76,8 +81,9 @@ with tab3:
         ids_in = st.text_area("Comma-separated IDs",
                               placeholder="2406.01234,2405.01234")
         if st.button("Ideate"):
-            ids = [x.strip() for x in ids_in.split(",") if x.strip()]
-            ideas = ideate_from_ids(ids)
+            with st.spinner("Thinking of new ideas..."):
+                ids = [x.strip() for x in ids_in.split(",") if x.strip()]
+                ideas = ideate_from_ids(ids)
             if ideas is None:
                 st.info("Those IDs aren’t in the database yet. "
                         "Fetch them via the **Search** tab, then try again.")