import streamlit as st
from huggingface_hub import HfApi
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import time

st.set_page_config(page_title="HF Contributions", layout="wide")
api = HfApi()


# Cache for API responses
@lru_cache(maxsize=1000)
def cached_repo_info(repo_id, repo_type):
    return api.repo_info(repo_id=repo_id, repo_type=repo_type)


@lru_cache(maxsize=1000)
def cached_list_commits(repo_id, repo_type):
    return list(api.list_repo_commits(repo_id=repo_id, repo_type=repo_type))


@lru_cache(maxsize=100)
def cached_list_items(username, kind):
    if kind == "model":
        return list(api.list_models(author=username))
    elif kind == "dataset":
        return list(api.list_datasets(author=username))
    elif kind == "space":
        return list(api.list_spaces(author=username))
    return []


# Rate limiting
class RateLimiter:
    def __init__(self, calls_per_second=10):
        self.calls_per_second = calls_per_second
        self.last_call = 0

    def wait(self):
        current_time = time.time()
        time_since_last_call = current_time - self.last_call
        if time_since_last_call < (1.0 / self.calls_per_second):
            time.sleep((1.0 / self.calls_per_second) - time_since_last_call)
        self.last_call = time.time()


rate_limiter = RateLimiter()


# Function to fetch commits for a repository (optimized)
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    try:
        rate_limiter.wait()
        # Skip private/gated repos upfront
        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or (hasattr(repo_info, 'gated') and repo_info.gated):
            return [], 0

        # Get initial commit date
        initial_commit_date = pd.to_datetime(repo_info.created_at).tz_localize(None).date()
        commit_dates = []
        commit_count = 0

        # Add initial commit if it's from the selected year
        if initial_commit_date.year == selected_year:
            commit_dates.append(initial_commit_date)
            commit_count += 1

        # Get all commits
        commits = cached_list_commits(repo_id, repo_type)
        for commit in commits:
            commit_date = pd.to_datetime(commit.created_at).tz_localize(None).date()
            if commit_date.year == selected_year:
                commit_dates.append(commit_date)
                commit_count += 1

        return commit_dates, commit_count
    except Exception:
        return [], 0


# Function to get commit events for a user (optimized)
def get_commit_events(username, kind=None, selected_year=None):
    commit_dates = []
    items_with_type = []
    kinds = [kind] if kind else ["model", "dataset", "space"]

    for k in kinds:
        try:
            items = cached_list_items(username, k)
            items_with_type.extend((item, k) for item in items)
            repo_ids = [item.id for item in items]

            # Optimized parallel fetch with chunking
            chunk_size = 5  # Process 5 repos at a time
            for i in range(0, len(repo_ids), chunk_size):
                chunk = repo_ids[i:i + chunk_size]
                with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
                    future_to_repo = {
                        executor.submit(fetch_commits_for_repo, repo_id, k, username, selected_year): repo_id
                        for repo_id in chunk
                    }
                    for future in as_completed(future_to_repo):
                        repo_commits, repo_count = future.result()
                        if repo_commits:  # Only extend if we got commits
                            commit_dates.extend(repo_commits)
        except Exception as e:
            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

    # Create DataFrame with all commits
    df = pd.DataFrame(commit_dates, columns=["date"])
    if not df.empty:
        df = df.drop_duplicates()  # Remove any duplicate dates
    return df, items_with_type
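
# A minimal usage sketch of the fetch pipeline above, kept as an uncalled
# helper so it never runs inside the Streamlit app. The helper name
# `_example_fetch` and the default username/year are illustrative, not part
# of the app's UI flow.
def _example_fetch(username="some-user", year=2024):
    # Returns a DataFrame of commit dates plus (item, kind) tuples for the
    # user's models; pass kind=None to sweep models, datasets, and spaces.
    df, items = get_commit_events(username, kind="model", selected_year=year)
    return df, items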
st.info(f"No {title.lower()} found for {year}.") return # Optimize DataFrame operations df["count"] = 1 df = df.groupby("date", as_index=False).sum() df["date"] = pd.to_datetime(df["date"]) # Create date range more efficiently start = pd.Timestamp(f"{year}-01-01") end = pd.Timestamp(f"{year}-12-31") all_days = pd.date_range(start=start, end=end) # Optimize DataFrame creation and merging heatmap_data = pd.DataFrame({"date": all_days, "count": 0}) heatmap_data = heatmap_data.merge(df, on="date", how="left", suffixes=("", "_y")) heatmap_data["count"] = heatmap_data["count_y"].fillna(0) heatmap_data = heatmap_data.drop("count_y", axis=1) # Calculate week and day of week more efficiently heatmap_data["dow"] = heatmap_data["date"].dt.dayofweek heatmap_data["week"] = (heatmap_data["date"] - start).dt.days // 7 # Create pivot table more efficiently pivot = heatmap_data.pivot(index="dow", columns="week", values="count").fillna(0) # Optimize month labels calculation month_labels = pd.date_range(start, end, freq="MS").strftime("%b") month_positions = pd.date_range(start, end, freq="MS").map(lambda x: (x - start).days // 7) # Create custom colormap with specific boundaries from matplotlib.colors import ListedColormap, BoundaryNorm colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39'] # GitHub-style green colors bounds = [0, 1, 3, 11, 31, float('inf')] # Boundaries for color transitions cmap = ListedColormap(colors) norm = BoundaryNorm(bounds, cmap.N) # Create plot more efficiently fig, ax = plt.subplots(figsize=(12, 1.2)) # Convert pivot values to integers to ensure proper color mapping pivot_int = pivot.astype(int) # Create heatmap with explicit vmin and vmax sns.heatmap(pivot_int, ax=ax, cmap=cmap, norm=norm, linewidths=0.5, linecolor="white", square=True, cbar=False, yticklabels=["M", "T", "W", "T", "F", "S", "S"]) ax.set_title(f"{title}", fontsize=12, pad=10) ax.set_xlabel("") ax.set_ylabel("") ax.set_xticks(month_positions) ax.set_xticklabels(month_labels, fontsize=8) ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8) st.pyplot(fig) # Sidebar with st.sidebar: st.title("👤 Contributor") username = st.selectbox( "Select or type a username", options=["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"], index=0 ) st.markdown("