# Page header captured along with the file (not part of the program):
#   fdaudens's picture
#   fdaudens HF Staff
#   link in description
#   55e52b0
import os
import requests
import tempfile
from datetime import datetime, timezone, timedelta
import base64
from tqdm.auto import tqdm
import pymupdf
# Base endpoint of the Hugging Face daily-papers API. Callers append the
# `date` (YYYY-MM-DD) and `limit` query parameters to this URL.
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
class PaperManager:
    """Fetch the Hugging Face daily papers and extract the top-voted paper's text.

    Instance state:
        raw_papers: entries returned by the most recent successful
            ``fetch_papers`` call (empty list otherwise).
        papers: entries selected by the most recent ``get_top_content`` call.
    """

    # Seconds to wait on any HTTP request. Without a timeout, ``requests``
    # blocks indefinitely when the remote end stalls.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.raw_papers = []
        self.papers = []

    def fetch_papers(self, date=None) -> bool:
        """
        Fetch papers from the API with optional date filtering.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format.
                Defaults to today's date.

        Returns:
            bool: True if a non-empty list of papers was fetched and stored
            in ``self.raw_papers``, False otherwise.
        """
        # Drop results of any earlier call so a failed fetch never leaves
        # stale entries behind.
        self.raw_papers = []

        if date is None:
            # NOTE(review): this uses the local clock; the API's day boundary
            # may be UTC -- confirm before switching to timezone.utc.
            date = datetime.now().strftime('%Y-%m-%d')

        url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
        print(f"Fetching papers from: {url}")

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except ValueError as e:
            # Body was not valid JSON (older ``requests`` raises plain ValueError).
            print(f"Unexpected error: {e}")
            return False

        if not data:
            print("No data received from API.")
            return False
        if not isinstance(data, list):
            # e.g. an error object; storing it would break the upvote ranking.
            print("Unexpected response format from API.")
            return False

        self.raw_papers = data
        print(f"Found {len(data)} papers for date {date}")
        return True

    @staticmethod
    def _upvotes(entry):
        """Return the entry's upvote count, treating missing or null values as 0."""
        paper = entry.get('paper') or {}
        return paper.get('upvotes') or 0

    @staticmethod
    def _select_top_paper(papers):
        """Return the entry with the most upvotes (first one wins ties), or None if empty."""
        return max(papers, key=PaperManager._upvotes, default=None)

    def get_top_content(self) -> dict:
        """
        Get the most upvoted paper from today's submissions.

        Returns:
            dict: Maps the paper title to ``{"id": <paper id>, "content":
            <extracted PDF text>}``. Empty if fetching failed or no papers
            were returned.

        Raises:
            Exception: Propagated from ``get_paper_text`` when the PDF
                cannot be downloaded.
            KeyError: If the selected entry lacks ``paper``/``id``/``title``.
        """
        self.papers = []
        if not self.fetch_papers():
            return {}

        top_paper = self._select_top_paper(self.raw_papers)
        if top_paper is None:
            print("No papers found for today.")
        else:
            self.papers = [top_paper]

        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for entry in tqdm(self.papers):
            paper = entry["paper"]
            paper_id = paper['id']
            contents[paper['title']] = {
                "id": paper_id,
                "content": self.get_paper_text(paper_id),
            }
        return contents

    def get_paper_text(self, paper_id) -> str:
        """
        Download a paper's PDF from arXiv and return its plain text.

        Args:
            paper_id (str): Identifier used to build the
                ``https://arxiv.org/pdf/<paper_id>.pdf`` URL.

        Returns:
            str: Text of all pages concatenated in page order.

        Raises:
            Exception: If the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Parse straight from memory: no fixed "temp.pdf" left in the working
        # directory, and no clash between concurrent runs.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
# Example usage: print the title and a short text preview of today's top paper.
if __name__ == "__main__":
    paper_manager = PaperManager()
    top_papers = paper_manager.get_top_content()
    for title, entry in top_papers.items():
        print(f"Title: {title}")
        # Each value is {"id": ..., "content": ...}; slicing the dict itself
        # would raise, so preview the extracted text only.
        print(f"Content: {entry['content'][:100]}...")  # First 100 characters