Sam-Oliveira committed on
Commit
06018df
·
1 Parent(s): e469e5f

change keybert again

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. src/helpers.py +4 -1
  3. src/scrape.py +20 -9
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  .DS_Store
2
- ra_env
 
 
1
  .DS_Store
2
+ ra_env
3
+ src/__pycache__
src/helpers.py CHANGED
@@ -30,4 +30,7 @@ def rows_by_tag(keyword: str, limit: int = 25):
30
  return conn.execute(
31
  "SELECT title, authors, summary, published FROM papers "
32
  "WHERE LOWER(tags) LIKE ? ORDER BY published DESC LIMIT ?", (q, limit)
33
- ).fetchall()
 
 
 
 
30
  return conn.execute(
31
  "SELECT title, authors, summary, published FROM papers "
32
  "WHERE LOWER(tags) LIKE ? ORDER BY published DESC LIMIT ?", (q, limit)
33
+ ).fetchall()
34
+
35
+
36
+
src/scrape.py CHANGED
@@ -2,23 +2,34 @@ import time, arxiv
2
  from query_builder import build_query
3
  from db import get_conn
4
  from config import MAX_RESULTS
5
- from keybert import KeyBERT
6
  import os
7
  import pathlib
8
- from sentence_transformers import SentenceTransformer
 
 
 
 
 
9
 
10
- os.environ["HF_HOME"] = "/data"
11
- os.environ["HF_HUB_CACHE"] = "/data"
12
- os.environ["TRANSFORMERS_CACHE"] = "/data"
13
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/data"
 
14
 
 
 
 
 
 
 
 
 
15
 
16
  st_model = SentenceTransformer(
17
  "sentence-transformers/all-MiniLM-L6-v2",
18
- cache_folder="/data" # <- writable
19
  )
20
-
21
- # 2) Hand it to KeyBERT
22
  kw_model = KeyBERT(st_model)
23
 
24
  def make_tags(title, abstract, top_n=5):
 
2
  from query_builder import build_query
3
  from db import get_conn
4
  from config import MAX_RESULTS
 
5
  import os
6
  import pathlib
7
+ import os, pathlib, uuid, shutil
8
+
9
+ BASE_CACHE = pathlib.Path("/data") # always writable in Spaces
10
+ CACHE_DIR = BASE_CACHE / "hf_cache" / str(os.getpid())
11
+
12
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
13
 
14
+ # 1) Point every HF-related lib there
15
+ os.environ["HF_HOME"] = str(CACHE_DIR)
16
+ os.environ["HF_HUB_CACHE"] = str(CACHE_DIR)
17
+ os.environ["TRANSFORMERS_CACHE"] = str(CACHE_DIR)
18
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(CACHE_DIR)
19
 
20
+ # 2) Remove any stale lock that might have been copied along
21
+ lock_file = CACHE_DIR / ".lock"
22
+ if lock_file.exists():
23
+ lock_file.unlink()
24
+
25
+ # 3) Now import and load the model safely
26
+ from sentence_transformers import SentenceTransformer
27
+ from keybert import KeyBERT
28
 
29
  st_model = SentenceTransformer(
30
  "sentence-transformers/all-MiniLM-L6-v2",
31
+ cache_folder=str(CACHE_DIR)
32
  )
 
 
33
  kw_model = KeyBERT(st_model)
34
 
35
  def make_tags(title, abstract, top_n=5):