# Spaces: Running on Zero  (page-header residue captured with this file; not part of the program)
import os | |
import requests | |
import tempfile | |
from datetime import datetime, timezone, timedelta | |
import base64 | |
from tqdm.auto import tqdm | |
import pymupdf | |
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers" | |
class PaperManager:
    """Fetch the daily papers listing and extract the text of the top paper.

    Attributes:
        raw_papers (list): Entries returned by the last successful
            ``fetch_papers`` call (empty until then).
        papers (list): Entries selected by ``get_top_content`` for text
            extraction (empty until then).
    """

    # Seconds to wait on any HTTP request; without a timeout ``requests``
    # can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Initialise both attributes up front so they always exist, even if
        # no fetch has happened yet or the fetch failed.
        self.raw_papers = []
        self.papers = []

    def fetch_papers(self, date=None):
        """
        Fetch papers from the API with optional date filtering.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format.
                Defaults to today's date.

        Returns:
            bool: True if papers were successfully fetched (and stored in
            ``self.raw_papers``), False on an empty response or any error.
        """
        try:
            # Use today's date if none provided.
            # NOTE(review): this is the machine's local date; confirm whether
            # the API expects UTC.
            if date is None:
                date = datetime.now().strftime('%Y-%m-%d')

            # Construct the URL with the date parameter
            url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
            print(f"Fetching papers from: {url}")

            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data
            print(f"Found {len(data)} papers for date {date}")
            return True
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            # Deliberate catch-all: callers rely on a bool, never an exception.
            print(f"Unexpected error: {e}")
            return False

    @staticmethod
    def _select_top(papers):
        """Return the entry with the most upvotes, or None if there are none.

        Entries without a 'paper' or 'upvotes' key count as 0 upvotes. On a
        tie the earliest entry wins, matching a stable descending sort.
        """
        if not papers:
            return None
        return max(
            papers,
            key=lambda entry: entry.get('paper', {}).get('upvotes', 0),
        )

    def get_top_content(self):
        """
        Get the most upvoted paper from today's submissions.

        Returns:
            dict: Maps the paper title to ``{"id": <paper id>, "content":
            <extracted PDF text>}``. Empty if fetching failed or no papers
            were found.

        Raises:
            Exception: Propagated from ``get_paper_text`` if the PDF
                download fails.
        """
        # Fetch papers from today
        if not self.fetch_papers():
            return {}

        # Take only the top paper
        top_paper = self._select_top(self.raw_papers)
        if top_paper is None:
            print("No papers found for today.")
            self.papers = []
        else:
            self.papers = [top_paper]

        # Get content
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            info = paper["paper"]
            paper_id = info['id']
            content = self.get_paper_text(paper_id)
            contents[info['title']] = {"id": paper_id, "content": content}
        return contents

    def get_paper_text(self, paper_id):
        """Download a paper's PDF from arxiv.org and return its plain text.

        Args:
            paper_id (str): Identifier used to build
                ``https://arxiv.org/pdf/<paper_id>.pdf``.

        Returns:
            str: Text of all pages, concatenated in page order.

        Raises:
            Exception: If the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Parse the PDF straight from memory. Writing to a fixed "temp.pdf"
        # in the working directory left a stray file behind and let
        # concurrent runs overwrite each other's download.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
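# Earlier draft of get_top_content, kept commented out for reference.
# NOTE(review): it calls self.filter_top_papers(), which is not defined in the
# visible code, and it maps titles to bare text rather than to the
# {"id": ..., "content": ...} dicts the active get_top_content returns.
# def get_top_content(self):
#     self.fetch_papers()
#     self.filter_top_papers()
#     contents = {}
#     print(f"Processing {len(self.papers)} papers:")
#     for paper in tqdm(self.papers):
#         paper_id = paper["paper"]['id']
#         contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
#     return contents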
# Example usage
def main():
    """Fetch today's most upvoted paper and print a short preview of its text."""
    paper_manager = PaperManager()
    top_papers = paper_manager.get_top_content()
    for title, entry in top_papers.items():
        print(f"Title: {title}")
        # Each value is a {"id": ..., "content": ...} dict, so slice the
        # extracted text; slicing the dict itself raises at runtime.
        print(f"Content: {entry['content'][:100]}...")  # First 100 characters of content


if __name__ == "__main__":
    main()