davanstrien HF staff committed on
Commit
2ddb7e5
1 Parent(s): 6cf3db7
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +6 -14
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
app.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- from datetime import datetime, timedelta
3
- from sys import platform
4
  from typing import Any, Dict
5
 
6
  import gradio as gr
7
  import pandas as pd
8
  from cachetools import TTLCache, cached
9
- from diskcache import Cache
10
  from dotenv import load_dotenv
11
  from httpx import Client
12
  from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
@@ -15,7 +13,7 @@ from tqdm.contrib.concurrent import thread_map
15
 
16
  load_dotenv()
17
 
18
- LIMIT = 5_000
19
  CACHE_TIME = 60 * 60 * 12 # 12 hours
20
  REMOVE_ORGS = {
21
  "HuggingFaceM4",
@@ -44,18 +42,13 @@ cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
44
 
45
 
46
  def get_three_months_ago():
47
- now = datetime.now()
48
  return now - timedelta(days=90)
49
 
50
 
51
- def parse_date(date_str):
52
- # parse the created date from string 2023-11-17T16:39:54.000Z to datetime
53
- return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
54
-
55
-
56
  def add_created_data(dataset):
57
  _id = dataset._id
58
- created = parse_date(dataset.createdAt)
59
  dataset_dict = dataset.__dict__
60
  dataset_dict["createdAt"] = created
61
  return dataset_dict
@@ -129,7 +122,6 @@ columns_to_drop = [
129
  "cardData",
130
  "gated",
131
  "sha",
132
- # "paperswithcode_id",
133
  "tags",
134
  "description",
135
  "siblings",
@@ -137,7 +129,7 @@ columns_to_drop = [
137
  "_id",
138
  "private",
139
  "author",
140
- "citation",
141
  "lastModified",
142
  ]
143
 
@@ -158,7 +150,7 @@ def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to
158
 
159
  def filter_df_by_max_age(df, max_age_days=None):
160
  df = df.dropna(subset=["createdAt"])
161
- now = datetime.now()
162
  if max_age_days is not None:
163
  max_date = now - timedelta(days=max_age_days)
164
  df = df[df["createdAt"] >= max_date]
 
1
  import os
2
+ from datetime import datetime, timedelta, timezone
 
3
  from typing import Any, Dict
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from cachetools import TTLCache, cached
 
8
  from dotenv import load_dotenv
9
  from httpx import Client
10
  from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
 
13
 
14
  load_dotenv()
15
 
16
+ LIMIT = 3_000
17
  CACHE_TIME = 60 * 60 * 12 # 12 hours
18
  REMOVE_ORGS = {
19
  "HuggingFaceM4",
 
42
 
43
 
44
  def get_three_months_ago():
45
+ now = datetime.now(timezone.utc)
46
  return now - timedelta(days=90)
47
 
48
 
 
 
 
 
 
49
  def add_created_data(dataset):
50
  _id = dataset._id
51
+ created = dataset.created_at
52
  dataset_dict = dataset.__dict__
53
  dataset_dict["createdAt"] = created
54
  return dataset_dict
 
122
  "cardData",
123
  "gated",
124
  "sha",
 
125
  "tags",
126
  "description",
127
  "siblings",
 
129
  "_id",
130
  "private",
131
  "author",
132
+ # "citation",
133
  "lastModified",
134
  ]
135
 
 
150
 
151
  def filter_df_by_max_age(df, max_age_days=None):
152
  df = df.dropna(subset=["createdAt"])
153
+ now = datetime.now(timezone.utc)
154
  if max_age_days is not None:
155
  max_date = now - timedelta(days=max_age_days)
156
  df = df[df["createdAt"] >= max_date]