Spaces:
Sleeping
Sleeping
Sam-Oliveira
committed on
Commit
·
06018df
1
Parent(s):
e469e5f
change keybert again
Browse files
- .gitignore +2 -1
- src/helpers.py +4 -1
- src/scrape.py +20 -9
.gitignore
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
.DS_Store
|
| 2 |
-
ra_env
|
|
|
|
|
|
| 1 |
.DS_Store
|
| 2 |
+
ra_env
|
| 3 |
+
src/__pycache__
|
src/helpers.py
CHANGED
|
@@ -30,4 +30,7 @@ def rows_by_tag(keyword: str, limit: int = 25):
|
|
| 30 |
return conn.execute(
|
| 31 |
"SELECT title, authors, summary, published FROM papers "
|
| 32 |
"WHERE LOWER(tags) LIKE ? ORDER BY published DESC LIMIT ?", (q, limit)
|
| 33 |
-
).fetchall()
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
return conn.execute(
|
| 31 |
"SELECT title, authors, summary, published FROM papers "
|
| 32 |
"WHERE LOWER(tags) LIKE ? ORDER BY published DESC LIMIT ?", (q, limit)
|
| 33 |
+
).fetchall()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
src/scrape.py
CHANGED
|
@@ -2,23 +2,34 @@ import time, arxiv
|
|
| 2 |
from query_builder import build_query
|
| 3 |
from db import get_conn
|
| 4 |
from config import MAX_RESULTS
|
| 5 |
-
from keybert import KeyBERT
|
| 6 |
import os
|
| 7 |
import pathlib
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
os.environ["
|
| 12 |
-
os.environ["
|
| 13 |
-
os.environ["
|
|
|
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
st_model = SentenceTransformer(
|
| 17 |
"sentence-transformers/all-MiniLM-L6-v2",
|
| 18 |
-
cache_folder=
|
| 19 |
)
|
| 20 |
-
|
| 21 |
-
# 2) Hand it to KeyBERT
|
| 22 |
kw_model = KeyBERT(st_model)
|
| 23 |
|
| 24 |
def make_tags(title, abstract, top_n=5):
|
|
|
|
| 2 |
from query_builder import build_query
|
| 3 |
from db import get_conn
|
| 4 |
from config import MAX_RESULTS
|
|
|
|
| 5 |
import os
|
| 6 |
import pathlib
|
| 7 |
+
import os, pathlib, uuid, shutil
|
| 8 |
+
|
| 9 |
+
BASE_CACHE = pathlib.Path("/data") # always writable in Spaces
|
| 10 |
+
CACHE_DIR = BASE_CACHE / "hf_cache" / str(os.getpid())
|
| 11 |
+
|
| 12 |
+
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 13 |
|
| 14 |
+
# 1) Point every HF-related lib there
|
| 15 |
+
os.environ["HF_HOME"] = str(CACHE_DIR)
|
| 16 |
+
os.environ["HF_HUB_CACHE"] = str(CACHE_DIR)
|
| 17 |
+
os.environ["TRANSFORMERS_CACHE"] = str(CACHE_DIR)
|
| 18 |
+
os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(CACHE_DIR)
|
| 19 |
|
| 20 |
+
# 2) Remove any stale lock that might have been copied along
|
| 21 |
+
lock_file = CACHE_DIR / ".lock"
|
| 22 |
+
if lock_file.exists():
|
| 23 |
+
lock_file.unlink()
|
| 24 |
+
|
| 25 |
+
# 3) Now import and load the model safely
|
| 26 |
+
from sentence_transformers import SentenceTransformer
|
| 27 |
+
from keybert import KeyBERT
|
| 28 |
|
| 29 |
st_model = SentenceTransformer(
|
| 30 |
"sentence-transformers/all-MiniLM-L6-v2",
|
| 31 |
+
cache_folder=str(CACHE_DIR)
|
| 32 |
)
|
|
|
|
|
|
|
| 33 |
kw_model = KeyBERT(st_model)
|
| 34 |
|
| 35 |
def make_tags(title, abstract, top_n=5):
|