evijit (HF Staff) committed
Commit 9d2f4f2 · verified · 1 Parent(s): 3610b1c

carryover from evijit

Files changed (3):
  1. app.py +468 -788
  2. models_processed.parquet +3 -0
  3. preprocess.py +371 -0
app.py CHANGED
@@ -1,846 +1,526 @@
1
  import json
2
  import gradio as gr
3
  import pandas as pd
4
  import plotly.express as px
5
  import os
6
  import numpy as np
7
- import io
8
  import duckdb
 
 
 
9
 
10
- # Define pipeline tags
11
- PIPELINE_TAGS = [
12
- 'text-generation',
13
- 'text-to-image',
14
- 'text-classification',
15
- 'text2text-generation',
16
- 'audio-to-audio',
17
- 'feature-extraction',
18
- 'image-classification',
19
- 'translation',
20
- 'reinforcement-learning',
21
- 'fill-mask',
22
- 'text-to-speech',
23
- 'automatic-speech-recognition',
24
- 'image-text-to-text',
25
- 'token-classification',
26
- 'sentence-similarity',
27
- 'question-answering',
28
- 'image-feature-extraction',
29
- 'summarization',
30
- 'zero-shot-image-classification',
31
- 'object-detection',
32
- 'image-segmentation',
33
- 'image-to-image',
34
- 'image-to-text',
35
- 'audio-classification',
36
- 'visual-question-answering',
37
- 'text-to-video',
38
- 'zero-shot-classification',
39
- 'depth-estimation',
40
- 'text-ranking',
41
- 'image-to-video',
42
- 'multiple-choice',
43
- 'unconditional-image-generation',
44
- 'video-classification',
45
- 'text-to-audio',
46
- 'time-series-forecasting',
47
- 'any-to-any',
48
- 'video-text-to-text',
49
- 'table-question-answering',
50
- ]
51
-
52
- # Model size categories in GB
53
  MODEL_SIZE_RANGES = {
54
- "Small (<1GB)": (0, 1),
55
- "Medium (1-5GB)": (1, 5),
56
- "Large (5-20GB)": (5, 20),
57
- "X-Large (20-50GB)": (20, 50),
58
- "XX-Large (>50GB)": (50, float('inf'))
59
  }
 
 
60
 
61
- # Filter functions for tags - UPDATED to use cached columns
62
- def is_audio_speech(row):
63
- # Use cached column instead of recalculating
64
- return row['is_audio_speech']
65
-
66
- def is_music(row):
67
- # Use cached column instead of recalculating
68
- return row['has_music']
69
-
70
- def is_robotics(row):
71
- # Use cached column instead of recalculating
72
- return row['has_robot']
73
 
74
- def is_biomed(row):
75
- # Use cached column instead of recalculating
76
- return row['is_biomed']
77
 
78
- def is_timeseries(row):
79
- # Use cached column instead of recalculating
80
- return row['has_series']
81
 
82
- def is_science(row):
83
- # Use cached column instead of recalculating
84
- return row['has_science']
 
85
 
86
- def is_video(row):
87
- # Use cached column instead of recalculating
88
- return row['has_video']
89
 
90
- def is_image(row):
91
- # Use cached column instead of recalculating
92
- return row['has_image']
93
 
94
- def is_text(row):
95
- # Use cached column instead of recalculating
96
- return row['has_text']
97
 
98
- def is_image(row):
99
- tags = row.get("tags", [])
100
 
101
- # Check if tags exists and is not empty
102
- if tags is not None:
103
- # For numpy arrays
104
- if hasattr(tags, 'dtype') and hasattr(tags, 'tolist'):
105
- # Convert numpy array to list
106
- tags_list = tags.tolist()
107
- return any("image" in str(tag).lower() for tag in tags_list)
108
- # For regular lists
109
- elif isinstance(tags, list):
110
- return any("image" in str(tag).lower() for tag in tags)
111
- # For string tags
112
- elif isinstance(tags, str):
113
- return "image" in tags.lower()
114
- return False
115
-
116
- def is_text(row):
117
- tags = row.get("tags", [])
118
 
119
- # Check if tags exists and is not empty
120
- if tags is not None:
121
- # For numpy arrays
122
- if hasattr(tags, 'dtype') and hasattr(tags, 'tolist'):
123
- # Convert numpy array to list
124
- tags_list = tags.tolist()
125
- return any("text" in str(tag).lower() for tag in tags_list)
126
- # For regular lists
127
- elif isinstance(tags, list):
128
- return any("text" in str(tag).lower() for tag in tags)
129
- # For string tags
130
- elif isinstance(tags, str):
131
- return "text" in tags.lower()
132
- return False
133
-
134
- def extract_model_size(safetensors_data):
135
- """Extract model size in GB from safetensors data"""
136
- try:
137
- if pd.isna(safetensors_data):
138
- return 0
139
-
140
- # If it's already a dictionary, use it directly
141
- if isinstance(safetensors_data, dict):
142
- if 'total' in safetensors_data:
143
- try:
144
- size_bytes = float(safetensors_data['total'])
145
- return size_bytes / (1024 * 1024 * 1024) # Convert to GB
146
- except (ValueError, TypeError):
147
- pass
148
-
149
- # If it's a string, try to parse it as JSON
150
- elif isinstance(safetensors_data, str):
151
- try:
152
- data_dict = json.loads(safetensors_data)
153
- if 'total' in data_dict:
154
- try:
155
- size_bytes = float(data_dict['total'])
156
- return size_bytes / (1024 * 1024 * 1024) # Convert to GB
157
- except (ValueError, TypeError):
158
- pass
159
- except:
160
- pass
161
-
162
- return 0
163
- except Exception as e:
164
- print(f"Error extracting model size: {e}")
165
- return 0
166
-
167
- # Add model size filter function - UPDATED to use cached size_category column
168
- def is_in_size_range(row, size_range):
169
- """Check if a model is in the specified size range using pre-calculated size category"""
170
- if size_range is None or size_range == "None":
171
- return True
172
 
173
- # Simply compare with cached size_category
174
- return row['size_category'] == size_range
175
-
176
- TAG_FILTER_FUNCS = {
177
- "Audio & Speech": is_audio_speech,
178
- "Time series": is_timeseries,
179
- "Robotics": is_robotics,
180
- "Music": is_music,
181
- "Video": is_video,
182
- "Images": is_image,
183
- "Text": is_text,
184
- "Biomedical": is_biomed,
185
- "Sciences": is_science,
186
- }
187
 
188
- def extract_org_from_id(model_id):
189
- """Extract organization name from model ID"""
190
- if "/" in model_id:
191
- return model_id.split("/")[0]
192
- return "unaffiliated"
193
 
194
  def make_treemap_data(df, count_by, top_k=25, tag_filter=None, pipeline_filter=None, size_filter=None, skip_orgs=None):
195
- """Process DataFrame into treemap format with filters applied - OPTIMIZED with cached columns"""
196
- # Create a copy to avoid modifying the original
197
  filtered_df = df.copy()
 
 
 
198
 
199
- # Apply filters
200
- filter_stats = {"initial": len(filtered_df)}
201
- start_time = pd.Timestamp.now()
202
-
203
- # Apply tag filter - OPTIMIZED to use cached columns
204
- if tag_filter and tag_filter in TAG_FILTER_FUNCS:
205
- print(f"Applying tag filter: {tag_filter}")
206
-
207
- # Use direct column filtering instead of applying a function to each row
208
- if tag_filter == "Audio & Speech":
209
- filtered_df = filtered_df[filtered_df['is_audio_speech']]
210
- elif tag_filter == "Music":
211
- filtered_df = filtered_df[filtered_df['has_music']]
212
- elif tag_filter == "Robotics":
213
- filtered_df = filtered_df[filtered_df['has_robot']]
214
- elif tag_filter == "Biomedical":
215
- filtered_df = filtered_df[filtered_df['is_biomed']]
216
- elif tag_filter == "Time series":
217
- filtered_df = filtered_df[filtered_df['has_series']]
218
- elif tag_filter == "Sciences":
219
- filtered_df = filtered_df[filtered_df['has_science']]
220
- elif tag_filter == "Video":
221
- filtered_df = filtered_df[filtered_df['has_video']]
222
- elif tag_filter == "Images":
223
- filtered_df = filtered_df[filtered_df['has_image']]
224
- elif tag_filter == "Text":
225
- filtered_df = filtered_df[filtered_df['has_text']]
226
-
227
- filter_stats["after_tag_filter"] = len(filtered_df)
228
- print(f"Tag filter applied in {(pd.Timestamp.now() - start_time).total_seconds():.3f} seconds")
229
- start_time = pd.Timestamp.now()
230
-
231
- # Apply pipeline filter
232
  if pipeline_filter:
233
- print(f"Applying pipeline filter: {pipeline_filter}")
234
- filtered_df = filtered_df[filtered_df["pipeline_tag"] == pipeline_filter]
235
- filter_stats["after_pipeline_filter"] = len(filtered_df)
236
- print(f"Pipeline filter applied in {(pd.Timestamp.now() - start_time).total_seconds():.3f} seconds")
237
- start_time = pd.Timestamp.now()
238
-
239
- # Apply size filter - OPTIMIZED to use cached size_category column
240
- if size_filter and size_filter in MODEL_SIZE_RANGES:
241
- print(f"Applying size filter: {size_filter}")
242
-
243
- # Use the cached size_category column directly
244
- filtered_df = filtered_df[filtered_df['size_category'] == size_filter]
245
-
246
- # Debug info
247
- print(f"Size filter '{size_filter}' applied.")
248
- print(f"Models after size filter: {len(filtered_df)}")
249
-
250
- filter_stats["after_size_filter"] = len(filtered_df)
251
- print(f"Size filter applied in {(pd.Timestamp.now() - start_time).total_seconds():.3f} seconds")
252
- start_time = pd.Timestamp.now()
253
-
254
- # Add organization column
255
- filtered_df["organization"] = filtered_df["id"].apply(extract_org_from_id)
256
-
257
- # Skip organizations if specified
258
  if skip_orgs and len(skip_orgs) > 0:
259
- filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
260
- filter_stats["after_skip_orgs"] = len(filtered_df)
261
-
262
- # Print filter stats
263
- print("Filter statistics:")
264
- for stage, count in filter_stats.items():
265
- print(f" {stage}: {count} models")
266
-
267
- # Check if we have any data left
268
- if filtered_df.empty:
269
- print("Warning: No data left after applying filters!")
270
- return pd.DataFrame() # Return empty DataFrame
271
-
272
- # Aggregate by organization
273
- org_totals = filtered_df.groupby("organization")[count_by].sum().reset_index()
274
- org_totals = org_totals.sort_values(by=count_by, ascending=False)
275
-
276
- # Get top organizations
277
- top_orgs = org_totals.head(top_k)["organization"].tolist()
278
-
279
- # Filter to only include models from top organizations
280
- filtered_df = filtered_df[filtered_df["organization"].isin(top_orgs)]
281
-
282
- # Prepare data for treemap
283
- treemap_data = filtered_df[["id", "organization", count_by]].copy()
284
-
285
- # Add a root node
286
  treemap_data["root"] = "models"
287
-
288
- # Ensure numeric values
289
- treemap_data[count_by] = pd.to_numeric(treemap_data[count_by], errors="coerce").fillna(0)
290
-
291
- print(f"Treemap data prepared in {(pd.Timestamp.now() - start_time).total_seconds():.3f} seconds")
292
  return treemap_data
293
 
294
  def create_treemap(treemap_data, count_by, title=None):
295
- """Create a Plotly treemap from the prepared data"""
296
  if treemap_data.empty:
297
- # Create an empty figure with a message
298
- fig = px.treemap(
299
- names=["No data matches the selected filters"],
300
- values=[1]
301
- )
302
- fig.update_layout(
303
- title="No data matches the selected filters",
304
- margin=dict(t=50, l=25, r=25, b=25)
305
- )
306
  return fig
307
-
308
- # Create the treemap
309
  fig = px.treemap(
310
- treemap_data,
311
- path=["root", "organization", "id"],
312
- values=count_by,
313
  title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization",
314
  color_discrete_sequence=px.colors.qualitative.Plotly
315
  )
316
-
317
- # Update layout
318
- fig.update_layout(
319
- margin=dict(t=50, l=25, r=25, b=25)
320
- )
321
-
322
- # Update traces for better readability
323
- fig.update_traces(
324
- textinfo="label+value+percent root",
325
- hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
326
- )
327
-
328
  return fig
329
 
330
- def load_models_data():
331
- """Load models data from Hugging Face using DuckDB with caching for improved performance"""
332
- try:
333
- # The URL to the parquet file
334
- parquet_url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
335
-
336
- print("Fetching data from Hugging Face models.parquet...")
337
-
338
- # Based on the column names provided, we can directly select the columns we need
339
- # Note: We need to select safetensors to get the model size information
340
  try:
341
- query = """
342
- SELECT
343
- id,
344
- downloads,
345
- downloadsAllTime,
346
- likes,
347
- pipeline_tag,
348
- tags,
349
- safetensors
350
- FROM read_parquet('https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet')
351
- """
352
- df = duckdb.sql(query).df()
353
- except Exception as sql_error:
354
- print(f"Error with specific column selection: {sql_error}")
355
- # Fallback to just selecting everything and then filtering
356
- print("Falling back to select * query...")
357
- query = "SELECT * FROM read_parquet('https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet')"
358
- raw_df = duckdb.sql(query).df()
359
-
360
- # Now extract only the columns we need
361
- needed_columns = ['id', 'downloads', 'downloadsAllTime', 'likes', 'pipeline_tag', 'tags', 'safetensors']
362
- available_columns = set(raw_df.columns)
363
- df = pd.DataFrame()
364
-
365
- # Copy over columns that exist
366
- for col in needed_columns:
367
- if col in available_columns:
368
- df[col] = raw_df[col]
369
  else:
370
- # Create empty columns for missing data
371
- if col in ['downloads', 'downloadsAllTime', 'likes']:
372
- df[col] = 0
373
- elif col == 'pipeline_tag':
374
- df[col] = ''
375
- elif col == 'tags':
376
- df[col] = [[] for _ in range(len(raw_df))]
377
- elif col == 'safetensors':
378
- df[col] = None
379
- elif col == 'id':
380
- # Create IDs based on index if missing
381
- df[col] = [f"model_{i}" for i in range(len(raw_df))]
382
-
383
- print(f"Data fetched successfully. Shape: {df.shape}")
384
-
385
- # Check if safetensors column exists before trying to process it
386
- if 'safetensors' in df.columns:
387
- # Add params column derived from safetensors.total (model size in GB)
388
- df['params'] = df['safetensors'].apply(extract_model_size)
389
-
390
- # Debug model sizes
391
- size_ranges = {
392
- "Small (<1GB)": 0,
393
- "Medium (1-5GB)": 0,
394
- "Large (5-20GB)": 0,
395
- "X-Large (20-50GB)": 0,
396
- "XX-Large (>50GB)": 0
397
- }
398
-
399
- # Count models in each size range
400
- for idx, row in df.iterrows():
401
- size_gb = row['params']
402
- if 0 <= size_gb < 1:
403
- size_ranges["Small (<1GB)"] += 1
404
- elif 1 <= size_gb < 5:
405
- size_ranges["Medium (1-5GB)"] += 1
406
- elif 5 <= size_gb < 20:
407
- size_ranges["Large (5-20GB)"] += 1
408
- elif 20 <= size_gb < 50:
409
- size_ranges["X-Large (20-50GB)"] += 1
410
- elif size_gb >= 50:
411
- size_ranges["XX-Large (>50GB)"] += 1
412
-
413
- print("Model size distribution:")
414
- for size_range, count in size_ranges.items():
415
- print(f" {size_range}: {count} models")
416
-
417
- # CACHE SIZE CATEGORY: Add a size_category column for faster filtering
418
- def get_size_category(size_gb):
419
- if 0 <= size_gb < 1:
420
- return "Small (<1GB)"
421
- elif 1 <= size_gb < 5:
422
- return "Medium (1-5GB)"
423
- elif 5 <= size_gb < 20:
424
- return "Large (5-20GB)"
425
- elif 20 <= size_gb < 50:
426
- return "X-Large (20-50GB)"
427
- elif size_gb >= 50:
428
- return "XX-Large (>50GB)"
429
- return None
430
-
431
- # Add cached size category column
432
- df['size_category'] = df['params'].apply(get_size_category)
433
-
434
- # Remove the safetensors column as we don't need it anymore
435
- df = df.drop(columns=['safetensors'])
436
- else:
437
- # If no safetensors column, add empty params column
438
- df['params'] = 0
439
- df['size_category'] = None
440
-
441
- # Process tags to ensure it's in the right format - FIXED
442
- def process_tags(tags_value):
443
- try:
444
- if pd.isna(tags_value) or tags_value is None:
445
- return []
446
-
447
- # If it's a numpy array, convert to a list of strings
448
- if hasattr(tags_value, 'dtype') and hasattr(tags_value, 'tolist'):
449
- # Note: This is the fix for the error
450
- return [str(tag) for tag in tags_value.tolist()]
451
-
452
- # If already a list, ensure all elements are strings
453
- if isinstance(tags_value, list):
454
- return [str(tag) for tag in tags_value]
455
 
456
- # If string, try to parse as JSON or split by comma
457
- if isinstance(tags_value, str):
458
- try:
459
- tags_list = json.loads(tags_value)
460
- if isinstance(tags_list, list):
461
- return [str(tag) for tag in tags_list]
462
- except:
463
- # Split by comma if JSON parsing fails
464
- return [tag.strip() for tag in tags_value.split(',') if tag.strip()]
465
 
466
- # Last resort, convert to string and return as a single tag
467
- return [str(tags_value)]
 
 
 
468
 
469
- except Exception as e:
470
- print(f"Error processing tags: {e}")
471
- return []
472
 
473
- # Check if tags column exists before trying to process it
474
- if 'tags' in df.columns:
475
- # Process tags column
476
- df['tags'] = df['tags'].apply(process_tags)
477
-
478
- # CACHE TAG CATEGORIES: Pre-calculate tag categories for faster filtering
479
- print("Pre-calculating cached tag categories...")
480
-
481
- # Helper functions to check for specific tags (simplified for caching)
482
- def has_audio_tag(tags):
483
- if tags and isinstance(tags, list):
484
- return any("audio" in str(tag).lower() for tag in tags)
485
- return False
486
-
487
- def has_speech_tag(tags):
488
- if tags and isinstance(tags, list):
489
- return any("speech" in str(tag).lower() for tag in tags)
490
- return False
491
-
492
- def has_music_tag(tags):
493
- if tags and isinstance(tags, list):
494
- return any("music" in str(tag).lower() for tag in tags)
495
- return False
496
-
497
- def has_robot_tag(tags):
498
- if tags and isinstance(tags, list):
499
- return any("robot" in str(tag).lower() for tag in tags)
500
- return False
501
-
502
- def has_bio_tag(tags):
503
- if tags and isinstance(tags, list):
504
- return any("bio" in str(tag).lower() for tag in tags)
505
- return False
506
-
507
- def has_med_tag(tags):
508
- if tags and isinstance(tags, list):
509
- return any("medic" in str(tag).lower() for tag in tags)
510
- return False
511
-
512
- def has_series_tag(tags):
513
- if tags and isinstance(tags, list):
514
- return any("series" in str(tag).lower() for tag in tags)
515
- return False
516
-
517
- def has_science_tag(tags):
518
- if tags and isinstance(tags, list):
519
- return any("science" in str(tag).lower() and "bigscience" not in str(tag).lower() for tag in tags)
520
- return False
521
-
522
- def has_video_tag(tags):
523
- if tags and isinstance(tags, list):
524
- return any("video" in str(tag).lower() for tag in tags)
525
- return False
526
-
527
- def has_image_tag(tags):
528
- if tags and isinstance(tags, list):
529
- return any("image" in str(tag).lower() for tag in tags)
530
- return False
531
-
532
- def has_text_tag(tags):
533
- if tags and isinstance(tags, list):
534
- return any("text" in str(tag).lower() for tag in tags)
535
- return False
536
-
537
- # Add cached columns for tag categories
538
- print("Creating cached tag columns...")
539
- df['has_audio'] = df['tags'].apply(has_audio_tag)
540
- df['has_speech'] = df['tags'].apply(has_speech_tag)
541
- df['has_music'] = df['tags'].apply(has_music_tag)
542
- df['has_robot'] = df['tags'].apply(has_robot_tag)
543
- df['has_bio'] = df['tags'].apply(has_bio_tag)
544
- df['has_med'] = df['tags'].apply(has_med_tag)
545
- df['has_series'] = df['tags'].apply(has_series_tag)
546
- df['has_science'] = df['tags'].apply(has_science_tag)
547
- df['has_video'] = df['tags'].apply(has_video_tag)
548
- df['has_image'] = df['tags'].apply(has_image_tag)
549
- df['has_text'] = df['tags'].apply(has_text_tag)
550
-
551
- # Create combined category flags for faster filtering
552
- df['is_audio_speech'] = (df['has_audio'] | df['has_speech'] |
553
- df['pipeline_tag'].str.contains('audio', case=False, na=False) |
554
- df['pipeline_tag'].str.contains('speech', case=False, na=False))
555
- df['is_biomed'] = df['has_bio'] | df['has_med']
556
-
557
- print("Cached tag columns created successfully!")
558
- else:
559
- # If no tags column, add empty tags and set all category flags to False
560
- df['tags'] = [[] for _ in range(len(df))]
561
- for col in ['has_audio', 'has_speech', 'has_music', 'has_robot',
562
- 'has_bio', 'has_med', 'has_series', 'has_science',
563
- 'has_video', 'has_image', 'has_text',
564
- 'is_audio_speech', 'is_biomed']:
565
- df[col] = False
566
-
567
- # Fill NaN values
568
- df.fillna({'downloads': 0, 'downloadsAllTime': 0, 'likes': 0, 'params': 0}, inplace=True)
569
-
570
- # Ensure pipeline_tag is a string
571
- if 'pipeline_tag' in df.columns:
572
- df['pipeline_tag'] = df['pipeline_tag'].fillna('')
573
- else:
574
- df['pipeline_tag'] = ''
575
-
576
- # Make sure all required columns exist
577
- for col in ['id', 'downloads', 'downloadsAllTime', 'likes', 'pipeline_tag', 'tags', 'params']:
578
- if col not in df.columns:
579
- if col in ['downloads', 'downloadsAllTime', 'likes', 'params']:
580
- df[col] = 0
581
- elif col == 'pipeline_tag':
582
- df[col] = ''
583
- elif col == 'tags':
584
- df[col] = [[] for _ in range(len(df))]
585
- elif col == 'id':
586
- df[col] = [f"model_{i}" for i in range(len(df))]
587
-
588
- print(f"Successfully processed {len(df)} models with cached tag and size information")
589
- return df, True
590
-
591
- except Exception as e:
592
- print(f"Error loading data: {e}")
593
- # Return an empty DataFrame and False to indicate loading failure
594
- return pd.DataFrame(), False
595
-
596
- # Create Gradio interface
597
- with gr.Blocks() as demo:
598
- models_data = gr.State()
599
- loading_complete = gr.State(False) # Flag to indicate data load completion
600
-
601
- with gr.Row():
602
- gr.Markdown("""
603
- # HuggingFace Models TreeMap Visualization
604
-
605
- This app shows how different organizations contribute to the HuggingFace ecosystem with their models.
606
- Use the filters to explore models by different metrics, tags, pipelines, and model sizes.
607
 
608
- The treemap visualizes models grouped by organization, with the size of each box representing the selected metric.
609
-
610
- """)
611
-
612
- with gr.Row():
613
- with gr.Column(scale=1):
614
- count_by_dropdown = gr.Dropdown(
615
- label="Metric",
616
- choices=[
617
- ("Downloads (last 30 days)", "downloads"),
618
- ("Downloads (All Time)", "downloadsAllTime"),
619
- ("Likes", "likes")
620
- ],
621
- value="downloads",
622
- info="Select the metric to determine box sizes"
623
- )
624
-
625
- filter_choice_radio = gr.Radio(
626
- label="Filter Type",
627
- choices=["None", "Tag Filter", "Pipeline Filter"],
628
- value="None",
629
- info="Choose how to filter the models"
630
- )
631
-
632
- tag_filter_dropdown = gr.Dropdown(
633
- label="Select Tag",
634
- choices=list(TAG_FILTER_FUNCS.keys()),
635
- value=None,
636
- visible=False,
637
- info="Filter models by domain/category"
638
- )
639
-
640
- pipeline_filter_dropdown = gr.Dropdown(
641
- label="Select Pipeline Tag",
642
- choices=PIPELINE_TAGS,
643
- value=None,
644
- visible=False,
645
- info="Filter models by specific pipeline"
646
- )
647
-
648
- size_filter_dropdown = gr.Dropdown(
649
- label="Model Size Filter",
650
- choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
651
- value="None",
652
- info="Filter models by their size (using params column)"
653
- )
654
-
655
- top_k_slider = gr.Slider(
656
- label="Number of Top Organizations",
657
- minimum=5,
658
- maximum=50,
659
- value=25,
660
- step=5,
661
- info="Number of top organizations to include"
662
- )
663
-
664
- skip_orgs_textbox = gr.Textbox(
665
- label="Organizations to Skip (comma-separated)",
666
- placeholder="e.g., OpenAI, Google",
667
- value="TheBloke, MaziyarPanahi, unsloth, modularai, Gensyn, bartowski"
668
- )
669
-
670
- generate_plot_button = gr.Button("Generate Plot", variant="primary", interactive=False)
671
- refresh_data_button = gr.Button("Refresh Data from Hugging Face", variant="secondary")
672
-
673
- with gr.Column(scale=3):
674
- plot_output = gr.Plot()
675
- stats_output = gr.Markdown("*Loading data from Hugging Face...*")
676
- data_info = gr.Markdown("")
677
-
678
- # Button enablement after data load
679
- def enable_plot_button(loaded):
680
- return gr.update(interactive=loaded)
681
-
682
- loading_complete.change(
683
- fn=enable_plot_button,
684
- inputs=[loading_complete],
685
- outputs=[generate_plot_button]
686
- )
687
-
688
- # Show/hide tag/pipeline dropdown
689
- def update_filter_visibility(filter_choice):
690
- if filter_choice == "Tag Filter":
691
- return gr.update(visible=True), gr.update(visible=False)
692
- elif filter_choice == "Pipeline Filter":
693
- return gr.update(visible=False), gr.update(visible=True)
694
- else:
695
- return gr.update(visible=False), gr.update(visible=False)
696
-
697
- filter_choice_radio.change(
698
- fn=update_filter_visibility,
699
- inputs=[filter_choice_radio],
700
- outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
701
- )
702
-
703
- # Function to handle data load and provide data info
704
- def load_and_provide_info():
705
- df, success = load_models_data()
706
 
707
- if success:
708
- # Generate information about the loaded data
709
- info_text = f"""
710
- ### Data Information
711
- - **Total models loaded**: {len(df):,}
712
- - **Last update**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
713
- - **Data source**: [Hugging Face Hub Stats](https://huggingface.co/datasets/cfahlgren1/hub-stats) (models.parquet)
714
- """
715
-
716
- # Return the data, loading status, and info text
717
- return df, True, info_text, "*Data loaded successfully. Use the controls to generate a plot.*"
718
- else:
719
- # Return empty data, failed loading status, and error message
720
- return pd.DataFrame(), False, "*Error loading data from Hugging Face.*", "*Failed to load data. Please try again.*"
721
-
722
- # Main generate function
723
- def generate_plot_on_click(count_by, filter_choice, tag_filter, pipeline_filter, size_filter, top_k, skip_orgs_text, data_df):
724
- if data_df is None or not isinstance(data_df, pd.DataFrame) or data_df.empty:
725
- return None, "Error: Data is still loading. Please wait a moment and try again."
726
-
727
- selected_tag_filter = None
728
- selected_pipeline_filter = None
729
- selected_size_filter = None
730
-
731
- if filter_choice == "Tag Filter":
732
- selected_tag_filter = tag_filter
733
- elif filter_choice == "Pipeline Filter":
734
- selected_pipeline_filter = pipeline_filter
735
-
736
- if size_filter != "None":
737
- selected_size_filter = size_filter
738
-
739
- skip_orgs = []
740
- if skip_orgs_text and skip_orgs_text.strip():
741
- skip_orgs = [org.strip() for org in skip_orgs_text.split(',') if org.strip()]
742
-
743
- treemap_data = make_treemap_data(
744
- df=data_df,
745
- count_by=count_by,
746
- top_k=top_k,
747
- tag_filter=selected_tag_filter,
748
- pipeline_filter=selected_pipeline_filter,
749
- size_filter=selected_size_filter,
750
- skip_orgs=skip_orgs
751
- )
752
-
753
- title_labels = {
754
- "downloads": "Downloads (last 30 days)",
755
- "downloadsAllTime": "Downloads (All Time)",
756
- "likes": "Likes"
757
- }
758
- title_text = f"HuggingFace Models - {title_labels.get(count_by, count_by)} by Organization"
759
-
760
- fig = create_treemap(
761
- treemap_data=treemap_data,
762
- count_by=count_by,
763
- title=title_text
764
- )
765
-
766
- if treemap_data.empty:
767
- stats_md = "No data matches the selected filters."
768
  else:
769
- total_models = len(treemap_data)
770
- total_value = treemap_data[count_by].sum()
771
-
772
- # Get top 5 organizations
773
- top_5_orgs = treemap_data.groupby("organization")[count_by].sum().sort_values(ascending=False).head(5)
774
-
775
- # Get top 5 individual models
776
- top_5_models = treemap_data[["id", count_by]].sort_values(by=count_by, ascending=False).head(5)
777
-
778
- # Create statistics section
779
- stats_md = f"""
780
- ## Statistics
781
- - **Total models shown**: {total_models:,}
782
- - **Total {count_by}**: {int(total_value):,}
783
-
784
- ## Top Organizations by {count_by.capitalize()}
785
 
786
- | Organization | {count_by.capitalize()} | % of Total |
787
- |--------------|-------------:|----------:|
788
- """
789
-
790
- # Add top organizations to the table
791
- for org, value in top_5_orgs.items():
792
- percentage = (value / total_value) * 100
793
- stats_md += f"| {org} | {int(value):,} | {percentage:.2f}% |\n"
794
-
795
- # Add the top models table
796
- stats_md += f"""
797
- ## Top Models by {count_by.capitalize()}
798
-
799
- | Model | {count_by.capitalize()} | % of Total |
800
- |-------|-------------:|----------:|
801
- """
802
-
803
- # Add top models to the table
804
- for _, row in top_5_models.iterrows():
805
- model_id = row["id"]
806
- value = row[count_by]
807
- percentage = (value / total_value) * 100
808
- stats_md += f"| {model_id} | {int(value):,} | {percentage:.2f}% |\n"
809
-
810
- # Add note about skipped organizations if any
811
- if skip_orgs:
812
- stats_md += f"\n*Note: {len(skip_orgs)} organization(s) excluded: {', '.join(skip_orgs)}*"
813
-
814
- return fig, stats_md
815
-
816
- # Load data at startup
817
  demo.load(
818
- fn=load_and_provide_info,
819
- inputs=[],
820
- outputs=[models_data, loading_complete, data_info, stats_output]
821
  )
822
-
823
- # Refresh data when button is clicked
824
  refresh_data_button.click(
825
- fn=load_and_provide_info,
826
- inputs=[],
827
- outputs=[models_data, loading_complete, data_info, stats_output]
828
  )
829
-
830
  generate_plot_button.click(
831
- fn=generate_plot_on_click,
832
- inputs=[
833
- count_by_dropdown,
834
- filter_choice_radio,
835
- tag_filter_dropdown,
836
- pipeline_filter_dropdown,
837
- size_filter_dropdown,
838
- top_k_slider,
839
- skip_orgs_textbox,
840
- models_data
841
- ],
842
- outputs=[plot_output, stats_output]
843
  )
844
 
845
  if __name__ == "__main__":
846
- demo.launch()
1
+ # --- START OF FILE app.py ---
2
+
3
  import json
4
  import gradio as gr
5
  import pandas as pd
6
  import plotly.express as px
7
  import os
8
  import numpy as np
 
9
  import duckdb
10
+ from tqdm.auto import tqdm # Standard tqdm for console, gr.Progress will track it
11
+ import time
12
+ import ast # For safely evaluating string representations of lists/dicts
13
 
14
+ # --- Constants ---
15
  MODEL_SIZE_RANGES = {
16
+ "Small (<1GB)": (0, 1), "Medium (1-5GB)": (1, 5), "Large (5-20GB)": (5, 20),
17
+ "X-Large (20-50GB)": (20, 50), "XX-Large (>50GB)": (50, float('inf'))
 
 
 
18
  }
19
+ PROCESSED_PARQUET_FILE_PATH = "models_processed.parquet"
20
+ HF_PARQUET_URL = 'https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet' # Added for completeness within app.py context
21
 
22
+ TAG_FILTER_CHOICES = [
23
+ "Audio & Speech", "Time series", "Robotics", "Music", "Video", "Images",
24
+ "Text", "Biomedical", "Sciences"
25
+ ]
26
 
27
+ PIPELINE_TAGS = [
28
+ 'text-generation', 'text-to-image', 'text-classification', 'text2text-generation',
29
+ 'audio-to-audio', 'feature-extraction', 'image-classification', 'translation',
30
+ 'reinforcement-learning', 'fill-mask', 'text-to-speech', 'automatic-speech-recognition',
31
+ 'image-text-to-text', 'token-classification', 'sentence-similarity', 'question-answering',
32
+ 'image-feature-extraction', 'summarization', 'zero-shot-image-classification',
33
+ 'object-detection', 'image-segmentation', 'image-to-image', 'image-to-text',
34
+ 'audio-classification', 'visual-question-answering', 'text-to-video',
35
+ 'zero-shot-classification', 'depth-estimation', 'text-ranking', 'image-to-video',
36
+ 'multiple-choice', 'unconditional-image-generation', 'video-classification',
37
+ 'text-to-audio', 'time-series-forecasting', 'any-to-any', 'video-text-to-text',
38
+ 'table-question-answering',
39
+ ]
40
 
41
+ def extract_model_size(safetensors_data):
42
+ try:
43
+ if pd.isna(safetensors_data): return 0.0
44
+ data_to_parse = safetensors_data
45
+ if isinstance(safetensors_data, str):
46
+ try:
47
+ if (safetensors_data.startswith('{') and safetensors_data.endswith('}')) or \
48
+ (safetensors_data.startswith('[') and safetensors_data.endswith(']')):
49
+ data_to_parse = ast.literal_eval(safetensors_data)
50
+ else: data_to_parse = json.loads(safetensors_data)
51
+ except: return 0.0
52
+ if isinstance(data_to_parse, dict) and 'total' in data_to_parse:
53
+ try:
54
+ total_bytes_val = data_to_parse['total']
55
+ size_bytes = float(total_bytes_val)
56
+ return size_bytes / (1024 * 1024 * 1024)
57
+ except (ValueError, TypeError): pass
58
+ return 0.0
59
+ except: return 0.0
60
 
61
+ def extract_org_from_id(model_id):
62
+ if pd.isna(model_id): return "unaffiliated"
63
+ model_id_str = str(model_id)
64
+ return model_id_str.split("/")[0] if "/" in model_id_str else "unaffiliated"
65
 
66
+ def process_tags_for_series(series_of_tags_values):
67
+ processed_tags_accumulator = []
 
68
 
69
+ for i, tags_value_from_series in enumerate(tqdm(series_of_tags_values, desc="Standardizing Tags", leave=False, unit="row")):
70
+ temp_processed_list_for_row = []
71
+ current_value_for_error_msg = str(tags_value_from_series)[:200] # Truncate for long error messages
72
 
73
+ try:
74
+ # Order of checks is important!
75
+ # 1. Handle explicit Python lists first
76
+ if isinstance(tags_value_from_series, list):
77
+ current_tags_in_list = []
78
+ for idx_tag, tag_item in enumerate(tags_value_from_series):
79
+ try:
80
+ # Ensure item is not NaN before string conversion if it might be a float NaN in a list
81
+ if pd.isna(tag_item): continue
82
+ str_tag = str(tag_item)
83
+ stripped_tag = str_tag.strip()
84
+ if stripped_tag:
85
+ current_tags_in_list.append(stripped_tag)
86
+ except Exception as e_inner_list_proc:
87
+ print(f"ERROR processing item '{tag_item}' (type: {type(tag_item)}) within a list for row {i}. Error: {e_inner_list_proc}. Original list: {current_value_for_error_msg}")
88
+ temp_processed_list_for_row = current_tags_in_list
89
+
90
+ # 2. Handle NumPy arrays
91
+ elif isinstance(tags_value_from_series, np.ndarray):
92
+ # Convert to list, then process elements, handling potential NaNs within the array
93
+ current_tags_in_list = []
94
+ for idx_tag, tag_item in enumerate(tags_value_from_series.tolist()): # .tolist() is crucial
95
+ try:
96
+ if pd.isna(tag_item): continue # Check for NaN after converting to Python type
97
+ str_tag = str(tag_item)
98
+ stripped_tag = str_tag.strip()
99
+ if stripped_tag:
100
+ current_tags_in_list.append(stripped_tag)
101
+ except Exception as e_inner_array_proc:
102
+ print(f"ERROR processing item '{tag_item}' (type: {type(tag_item)}) within a NumPy array for row {i}. Error: {e_inner_array_proc}. Original array: {current_value_for_error_msg}")
103
+ temp_processed_list_for_row = current_tags_in_list
104
+
105
+ # 3. Handle simple None or pd.NA after lists and arrays (which might contain pd.NA elements handled above)
106
+ elif tags_value_from_series is None or pd.isna(tags_value_from_series): # Now pd.isna is safe for scalars
107
+ temp_processed_list_for_row = []
108
+
109
+ # 4. Handle strings (could be JSON-like, list-like, or comma-separated)
110
+ elif isinstance(tags_value_from_series, str):
111
+ processed_str_tags = []
112
+ # Attempt ast.literal_eval for strings that look like lists/tuples
113
+ if (tags_value_from_series.startswith('[') and tags_value_from_series.endswith(']')) or \
114
+ (tags_value_from_series.startswith('(') and tags_value_from_series.endswith(')')):
115
+ try:
116
+ evaluated_tags = ast.literal_eval(tags_value_from_series)
117
+ if isinstance(evaluated_tags, (list, tuple)): # Check if eval result is a list/tuple
118
+ # Recursively process this evaluated list/tuple, as its elements could be complex
119
+ # For simplicity here, assume elements are simple strings after eval
120
+ current_eval_list = []
121
+ for tag_item in evaluated_tags:
122
+ if pd.isna(tag_item): continue
123
+ str_tag = str(tag_item).strip()
124
+ if str_tag: current_eval_list.append(str_tag)
125
+ processed_str_tags = current_eval_list
126
+ except (ValueError, SyntaxError):
127
+ pass # If ast.literal_eval fails, let it fall to JSON or comma split
128
+
129
+ # If ast.literal_eval didn't populate, try JSON
130
+ if not processed_str_tags:
131
+ try:
132
+ json_tags = json.loads(tags_value_from_series)
133
+ if isinstance(json_tags, list):
134
+ # Similar to above, assume elements are simple strings after JSON parsing
135
+ current_json_list = []
136
+ for tag_item in json_tags:
137
+ if pd.isna(tag_item): continue
138
+ str_tag = str(tag_item).strip()
139
+ if str_tag: current_json_list.append(str_tag)
140
+ processed_str_tags = current_json_list
141
+ except json.JSONDecodeError:
142
+ # If not a valid JSON list, fall back to comma splitting as the final string strategy
143
+ processed_str_tags = [tag.strip() for tag in tags_value_from_series.split(',') if tag.strip()]
144
+ except Exception as e_json_other:
145
+ print(f"ERROR during JSON processing for string '{current_value_for_error_msg}' for row {i}. Error: {e_json_other}")
146
+ processed_str_tags = [tag.strip() for tag in tags_value_from_series.split(',') if tag.strip()] # Fallback
147
+
148
+ temp_processed_list_for_row = processed_str_tags
149
+
150
+ # 5. Fallback for other scalar types (e.g., int, float that are not NaN)
151
+ else:
152
+ # This path is for non-list, non-ndarray, non-None/NaN, non-string types.
153
+ # Or for NaNs that slipped through if they are not None or pd.NA (e.g. float('nan'))
154
+ if pd.isna(tags_value_from_series): # Catch any remaining NaNs like float('nan')
155
+ temp_processed_list_for_row = []
156
+ else:
157
+ str_val = str(tags_value_from_series).strip()
158
+ temp_processed_list_for_row = [str_val] if str_val else []
159
+
160
+ processed_tags_accumulator.append(temp_processed_list_for_row)
161
 
162
+ except Exception as e_outer_tag_proc:
163
+ print(f"CRITICAL UNHANDLED ERROR processing row {i}: value '{current_value_for_error_msg}' (type: {type(tags_value_from_series)}). Error: {e_outer_tag_proc}. Appending [].")
164
+ processed_tags_accumulator.append([])
165
+
166
+ return processed_tags_accumulator
167
+
168
+ def load_models_data(force_refresh=False, tqdm_cls=None):
169
+ if tqdm_cls is None: tqdm_cls = tqdm
170
+ overall_start_time = time.time()
171
+ print(f"Gradio load_models_data called with force_refresh={force_refresh}")
172
+
173
+ expected_cols_in_processed_parquet = [
174
+ 'id', 'downloads', 'downloadsAllTime', 'likes', 'pipeline_tag', 'tags', 'params',
175
+ 'size_category', 'organization', 'has_audio', 'has_speech', 'has_music',
176
+ 'has_robot', 'has_bio', 'has_med', 'has_series', 'has_video', 'has_image',
177
+ 'has_text', 'has_science', 'is_audio_speech', 'is_biomed',
178
+ 'data_download_timestamp'
179
+ ]
180
+
181
+ if not force_refresh and os.path.exists(PROCESSED_PARQUET_FILE_PATH):
182
+ print(f"Attempting to load pre-processed data from: {PROCESSED_PARQUET_FILE_PATH}")
183
+ try:
184
+ df = pd.read_parquet(PROCESSED_PARQUET_FILE_PATH)
185
+ elapsed = time.time() - overall_start_time
186
+ missing_cols = [col for col in expected_cols_in_processed_parquet if col not in df.columns]
187
+ if missing_cols:
188
+ raise ValueError(f"Pre-processed Parquet is missing columns: {missing_cols}. Please run preprocessor or refresh data in app.")
189
+
190
+ # --- Diagnostic for 'has_robot' after loading parquet ---
191
+ if 'has_robot' in df.columns:
192
+ robot_count_parquet = df['has_robot'].sum()
193
+ print(f"DIAGNOSTIC (App - Parquet Load): 'has_robot' column found. Number of True values: {robot_count_parquet}")
194
+ if 0 < robot_count_parquet < 10:
195
+ print(f"Sample 'has_robot' models (from parquet): {df[df['has_robot']]['id'].head().tolist()}")
196
+ else:
197
+ print("DIAGNOSTIC (App - Parquet Load): 'has_robot' column NOT FOUND.")
198
+ # --- End Diagnostic ---
199
+
200
+ msg = f"Successfully loaded pre-processed data in {elapsed:.2f}s. Shape: {df.shape}"
201
+ print(msg)
202
+ return df, True, msg
203
+ except Exception as e:
204
+ print(f"Could not load pre-processed Parquet: {e}. ")
205
+ if force_refresh: print("Proceeding to fetch fresh data as force_refresh=True.")
206
+ else:
207
+ err_msg = (f"Pre-processed data could not be loaded: {e}. "
208
+ "Please use 'Refresh Data from Hugging Face' button.")
209
+ return pd.DataFrame(), False, err_msg
210
+
211
+ df_raw = None
212
+ raw_data_source_msg = ""
213
+ if force_refresh:
214
+ print("force_refresh=True (Gradio). Fetching fresh data...")
215
+ fetch_start = time.time()
216
+ try:
217
+ query = f"SELECT * FROM read_parquet('{HF_PARQUET_URL}')" # Ensure HF_PARQUET_URL is defined
218
+ df_raw = duckdb.sql(query).df()
219
+ if df_raw is None or df_raw.empty: raise ValueError("Fetched data is empty or None.")
220
+ raw_data_source_msg = f"Fetched by Gradio in {time.time() - fetch_start:.2f}s. Rows: {len(df_raw)}"
221
+ print(raw_data_source_msg)
222
+ except Exception as e_hf:
223
+ return pd.DataFrame(), False, f"Fatal error fetching from Hugging Face (Gradio): {e_hf}"
224
+ else:
225
+ err_msg = (f"Pre-processed data '{PROCESSED_PARQUET_FILE_PATH}' not found/invalid. "
226
+ "Run preprocessor or use 'Refresh Data' button.")
227
+ return pd.DataFrame(), False, err_msg
228
+
229
+ print(f"Initiating processing for data newly fetched by Gradio. {raw_data_source_msg}")
230
+ df = pd.DataFrame()
231
+ proc_start = time.time()
232
 
233
+ core_cols = {'id': str, 'downloads': float, 'downloadsAllTime': float, 'likes': float,
234
+ 'pipeline_tag': str, 'tags': object, 'safetensors': object}
235
+ for col, dtype in core_cols.items():
236
+ if col in df_raw.columns:
237
+ df[col] = df_raw[col]
238
+ if dtype == float: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
239
+ elif dtype == str: df[col] = df[col].astype(str).fillna('')
240
+ else:
241
+ if col in ['downloads', 'downloadsAllTime', 'likes']: df[col] = 0.0
242
+ elif col == 'pipeline_tag': df[col] = ''
243
+ elif col == 'tags': df[col] = pd.Series([[] for _ in range(len(df_raw))])
244
+ elif col == 'safetensors': df[col] = None
245
+ elif col == 'id': return pd.DataFrame(), False, "Critical: 'id' column missing."
 
 
 
 
246
 
247
+ output_filesize_col_name = 'params'
248
+ if output_filesize_col_name in df_raw.columns and pd.api.types.is_numeric_dtype(df_raw[output_filesize_col_name]):
249
+ df[output_filesize_col_name] = pd.to_numeric(df_raw[output_filesize_col_name], errors='coerce').fillna(0.0)
250
+ elif 'safetensors' in df.columns:
251
+ safetensors_iter = df['safetensors']
252
+ if tqdm_cls != tqdm :
253
+ safetensors_iter = tqdm_cls(df['safetensors'], desc="Extracting model sizes (GB)")
254
+ df[output_filesize_col_name] = [extract_model_size(s) for s in safetensors_iter]
255
+ df[output_filesize_col_name] = pd.to_numeric(df[output_filesize_col_name], errors='coerce').fillna(0.0)
256
+ else:
257
+ df[output_filesize_col_name] = 0.0
258
+
259
+ def get_size_category_gradio(size_gb_val):
260
+ try: numeric_size_gb = float(size_gb_val)
261
+ except (ValueError, TypeError): numeric_size_gb = 0.0
262
+ if pd.isna(numeric_size_gb): numeric_size_gb = 0.0
263
+ if 0 <= numeric_size_gb < 1: return "Small (<1GB)"
264
+ elif 1 <= numeric_size_gb < 5: return "Medium (1-5GB)"
265
+ elif 5 <= numeric_size_gb < 20: return "Large (5-20GB)"
266
+ elif 20 <= numeric_size_gb < 50: return "X-Large (20-50GB)"
267
+ elif numeric_size_gb >= 50: return "XX-Large (>50GB)"
268
+ else: return "Small (<1GB)"
269
+ df['size_category'] = df[output_filesize_col_name].apply(get_size_category_gradio)
270
+
271
+ df['tags'] = process_tags_for_series(df['tags'])
272
+ df['temp_tags_joined'] = df['tags'].apply(
273
+ lambda tl: '~~~'.join(str(t).lower() for t in tl if pd.notna(t) and str(t).strip()) if isinstance(tl, list) else ''
274
+ )
275
+ tag_map = {
276
+ 'has_audio': ['audio'], 'has_speech': ['speech'], 'has_music': ['music'],
277
+ 'has_robot': ['robot', 'robotics'],
278
+ 'has_bio': ['bio'], 'has_med': ['medic', 'medical'],
279
+ 'has_series': ['series', 'time-series', 'timeseries'],
280
+ 'has_video': ['video'], 'has_image': ['image', 'vision'],
281
+ 'has_text': ['text', 'nlp', 'llm']
282
+ }
283
+ for col, kws in tag_map.items():
284
+ pattern = '|'.join(kws)
285
+ df[col] = df['temp_tags_joined'].str.contains(pattern, na=False, case=False, regex=True)
286
+ df['has_science'] = (
287
+ df['temp_tags_joined'].str.contains('science', na=False, case=False, regex=True) &
288
+ ~df['temp_tags_joined'].str.contains('bigscience', na=False, case=False, regex=True)
289
+ )
290
+ del df['temp_tags_joined']
291
+ df['is_audio_speech'] = (df['has_audio'] | df['has_speech'] |
292
+ df['pipeline_tag'].str.contains('audio|speech', case=False, na=False, regex=True))
293
+ df['is_biomed'] = df['has_bio'] | df['has_med']
294
+ df['organization'] = df['id'].apply(extract_org_from_id)
295
+
296
+ if 'safetensors' in df.columns and \
297
+ not (output_filesize_col_name in df_raw.columns and pd.api.types.is_numeric_dtype(df_raw[output_filesize_col_name])):
298
+ df = df.drop(columns=['safetensors'], errors='ignore')
 
299
 
300
+ # --- Diagnostic for 'has_robot' after app-side processing (force_refresh path) ---
301
+ if force_refresh and 'has_robot' in df.columns:
302
+ robot_count_app_proc = df['has_robot'].sum()
303
+ print(f"DIAGNOSTIC (App - Force Refresh Processing): 'has_robot' column processed. Number of True values: {robot_count_app_proc}")
304
+ if 0 < robot_count_app_proc < 10:
305
+ print(f"Sample 'has_robot' models (App processed): {df[df['has_robot']]['id'].head().tolist()}")
306
+ # --- End Diagnostic ---
307
+
308
+ print(f"Data processing by Gradio completed in {time.time() - proc_start:.2f}s.")
309
+
310
+ total_elapsed = time.time() - overall_start_time
311
+ final_msg = f"{raw_data_source_msg}. Processing by Gradio took {time.time() - proc_start:.2f}s. Total: {total_elapsed:.2f}s. Shape: {df.shape}"
312
+ print(final_msg)
313
+ return df, True, final_msg
314
 
 
 
 
 
 
315
 
316
  def make_treemap_data(df, count_by, top_k=25, tag_filter=None, pipeline_filter=None, size_filter=None, skip_orgs=None):
317
+ if df is None or df.empty: return pd.DataFrame()
 
318
  filtered_df = df.copy()
319
+ col_map = { "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
320
+ "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
321
+ "Video": "has_video", "Images": "has_image", "Text": "has_text"}
322
 
323
+ # --- Diagnostic within make_treemap_data ---
324
+ if 'has_robot' in filtered_df.columns:
325
+ initial_robot_count = filtered_df['has_robot'].sum()
326
+ print(f"DIAGNOSTIC (make_treemap_data entry): Input df has {initial_robot_count} 'has_robot' models.")
327
+ else:
328
+ print("DIAGNOSTIC (make_treemap_data entry): 'has_robot' column NOT in input df.")
329
+ # --- End Diagnostic ---
330
+
331
+ if tag_filter and tag_filter in col_map:
332
+ target_col = col_map[tag_filter]
333
+ if target_col in filtered_df.columns:
334
+ # --- Diagnostic for specific 'Robotics' filter application ---
335
+ if tag_filter == "Robotics":
336
+ count_before_robot_filter = filtered_df[target_col].sum()
337
+ print(f"DIAGNOSTIC (make_treemap_data): Applying 'Robotics' filter. Models with '{target_col}'=True before this filter step: {count_before_robot_filter}")
338
+ # --- End Diagnostic ---
339
+ filtered_df = filtered_df[filtered_df[target_col]]
340
+ if tag_filter == "Robotics":
341
+ print(f"DIAGNOSTIC (make_treemap_data): After 'Robotics' filter ({target_col}), df rows: {len(filtered_df)}")
342
+ else:
343
+ print(f"Warning: Tag filter column '{col_map[tag_filter]}' not found in DataFrame.")
 
 
 
 
 
 
 
 
 
 
 
 
344
  if pipeline_filter:
345
+ if "pipeline_tag" in filtered_df.columns:
346
+ filtered_df = filtered_df[filtered_df["pipeline_tag"] == pipeline_filter]
347
+ else:
348
+ print(f"Warning: 'pipeline_tag' column not found for filtering.")
349
+ if size_filter and size_filter != "None" and size_filter in MODEL_SIZE_RANGES.keys():
350
+ if 'size_category' in filtered_df.columns:
351
+ filtered_df = filtered_df[filtered_df['size_category'] == size_filter]
352
+ else:
353
+ print("Warning: 'size_category' column not found for filtering.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  if skip_orgs and len(skip_orgs) > 0:
355
+ if "organization" in filtered_df.columns:
356
+ filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
357
+ else:
358
+ print("Warning: 'organization' column not found for filtering.")
359
+ if filtered_df.empty: return pd.DataFrame()
360
+ if count_by not in filtered_df.columns or not pd.api.types.is_numeric_dtype(filtered_df[count_by]):
361
+ filtered_df[count_by] = pd.to_numeric(filtered_df.get(count_by), errors="coerce").fillna(0.0)
362
+ org_totals = filtered_df.groupby("organization")[count_by].sum().nlargest(top_k, keep='first')
363
+ top_orgs_list = org_totals.index.tolist()
364
+ treemap_data = filtered_df[filtered_df["organization"].isin(top_orgs_list)][["id", "organization", count_by]].copy()
 
365
  treemap_data["root"] = "models"
366
+ treemap_data[count_by] = pd.to_numeric(treemap_data[count_by], errors="coerce").fillna(0.0)
 
 
 
 
367
  return treemap_data
368
 
369
  def create_treemap(treemap_data, count_by, title=None):
 
370
  if treemap_data.empty:
371
+ fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
372
+ fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
 
373
  return fig
 
 
374
  fig = px.treemap(
375
+ treemap_data, path=["root", "organization", "id"], values=count_by,
 
 
376
  title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization",
377
  color_discrete_sequence=px.colors.qualitative.Plotly
378
  )
379
+ fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
380
+ fig.update_traces(textinfo="label+value+percent root", hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>")
 
381
  return fig
382
 
383
+ with gr.Blocks(title="HuggingFace Model Explorer", fill_width=True) as demo:
384
+ models_data_state = gr.State(pd.DataFrame())
385
+ loading_complete_state = gr.State(False)
386
+
387
+ with gr.Row(): gr.Markdown("# HuggingFace Models TreeMap Visualization")
388
+ with gr.Row():
389
+ with gr.Column(scale=1):
390
+ count_by_dropdown = gr.Dropdown(label="Metric", choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")], value="downloads")
391
+ filter_choice_radio = gr.Radio(label="Filter Type", choices=["None", "Tag Filter", "Pipeline Filter"], value="None")
392
+ tag_filter_dropdown = gr.Dropdown(label="Select Tag", choices=TAG_FILTER_CHOICES, value=None, visible=False)
393
+ pipeline_filter_dropdown = gr.Dropdown(label="Select Pipeline Tag", choices=PIPELINE_TAGS, value=None, visible=False)
394
+ size_filter_dropdown = gr.Dropdown(label="Model Size Filter", choices=["None"] + list(MODEL_SIZE_RANGES.keys()), value="None")
395
+ top_k_slider = gr.Slider(label="Number of Top Organizations", minimum=5, maximum=50, value=25, step=5)
396
+ skip_orgs_textbox = gr.Textbox(label="Organizations to Skip (comma-separated)", value="TheBloke,MaziyarPanahi,unsloth,modularai,Gensyn,bartowski")
397
+ generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
398
+ refresh_data_button = gr.Button(value="Refresh Data from Hugging Face", variant="secondary")
399
+ with gr.Column(scale=3):
400
+ plot_output = gr.Plot()
401
+ status_message_md = gr.Markdown("Initializing...")
402
+ data_info_md = gr.Markdown("")
403
+
404
+ def _update_button_interactivity(is_loaded_flag):
405
+ return gr.update(interactive=is_loaded_flag)
406
+ loading_complete_state.change(fn=_update_button_interactivity, inputs=loading_complete_state, outputs=generate_plot_button)
407
+
408
+ def _toggle_filters_visibility(choice):
409
+ return gr.update(visible=choice == "Tag Filter"), gr.update(visible=choice == "Pipeline Filter")
410
+ filter_choice_radio.change(fn=_toggle_filters_visibility, inputs=filter_choice_radio, outputs=[tag_filter_dropdown, pipeline_filter_dropdown])
411
+
412
+ def ui_load_data_controller(force_refresh_ui_trigger=False, progress=gr.Progress(track_tqdm=True)):
413
+ print(f"ui_load_data_controller called with force_refresh_ui_trigger={force_refresh_ui_trigger}")
414
+ status_msg_ui = "Loading data..."
415
+ data_info_text = ""
416
+ current_df = pd.DataFrame()
417
+ load_success_flag = False
418
+ data_as_of_date_display = "N/A"
419
  try:
420
+ current_df, load_success_flag, status_msg_from_load = load_models_data(
421
+ force_refresh=force_refresh_ui_trigger, tqdm_cls=progress.tqdm
422
+ )
423
+ if load_success_flag:
424
+ if force_refresh_ui_trigger:
425
+ data_as_of_date_display = pd.Timestamp.now(tz='UTC').strftime('%B %d, %Y, %H:%M:%S %Z')
426
+ elif 'data_download_timestamp' in current_df.columns and not current_df.empty and pd.notna(current_df['data_download_timestamp'].iloc[0]):
427
+ timestamp_from_parquet = pd.to_datetime(current_df['data_download_timestamp'].iloc[0])
428
+ if timestamp_from_parquet.tzinfo is None:
429
+ timestamp_from_parquet = timestamp_from_parquet.tz_localize('UTC')
430
+ data_as_of_date_display = timestamp_from_parquet.strftime('%B %d, %Y, %H:%M:%S %Z')
 
431
  else:
432
+ data_as_of_date_display = "Pre-processed (date unavailable)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
+ size_dist_lines = []
435
+ if 'size_category' in current_df.columns:
436
+ for cat in MODEL_SIZE_RANGES.keys():
437
+ count = (current_df['size_category'] == cat).sum()
438
+ size_dist_lines.append(f" - {cat}: {count:,} models")
439
+ else: size_dist_lines.append(" - Size category information not available.")
440
+ size_dist = "\n".join(size_dist_lines)
 
 
441
 
442
+ data_info_text = (f"### Data Information\n"
443
+ f"- Overall Status: {status_msg_from_load}\n"
444
+ f"- Total models loaded: {len(current_df):,}\n"
445
+ f"- Data as of: {data_as_of_date_display}\n"
446
+ f"- Size categories:\n{size_dist}")
447
 
448
+ # # --- MODIFICATION: Add 'has_robot' count to UI data_info_text ---
449
+ # if not current_df.empty and 'has_robot' in current_df.columns:
450
+ # robot_true_count = current_df['has_robot'].sum()
451
+ # data_info_text += f"\n- **Models flagged 'has_robot'**: {robot_true_count}"
452
+ # if 0 < robot_true_count <= 10: # If a few are found, list some IDs
453
+ # sample_robot_ids = current_df[current_df['has_robot']]['id'].head(5).tolist()
454
+ # data_info_text += f"\n - Sample 'has_robot' model IDs: `{', '.join(sample_robot_ids)}`"
455
+ # elif not current_df.empty:
456
+ # data_info_text += "\n- **Models flagged 'has_robot'**: 'has_robot' column not found in loaded data."
457
+ # # --- END MODIFICATION ---
458
+
459
+ status_msg_ui = "Data loaded successfully. Ready to generate plot."
460
+ else:
461
+ data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
462
+ status_msg_ui = status_msg_from_load
463
+ except Exception as e:
464
+ status_msg_ui = f"An unexpected error occurred in ui_load_data_controller: {str(e)}"
465
+ data_info_text = f"### Critical Error\n- {status_msg_ui}"
466
+ print(f"Critical error in ui_load_data_controller: {e}")
467
+ load_success_flag = False
468
+ return current_df, load_success_flag, data_info_text, status_msg_ui
+
+ def ui_generate_plot_controller(metric_choice, filter_type, tag_choice, pipeline_choice,
+                                 size_choice, k_orgs, skip_orgs_input, df_current_models):
+     if df_current_models is None or df_current_models.empty:
+         empty_fig = create_treemap(pd.DataFrame(), metric_choice, "Error: Model Data Not Loaded")
+         error_msg = "Model data is not loaded or is empty. Please load or refresh data first."
+         gr.Warning(error_msg)
+         return empty_fig, error_msg
+     tag_to_use = tag_choice if filter_type == "Tag Filter" else None
+     pipeline_to_use = pipeline_choice if filter_type == "Pipeline Filter" else None
+     size_to_use = size_choice if size_choice != "None" else None
+     orgs_to_skip = [org.strip() for org in skip_orgs_input.split(',') if org.strip()] if skip_orgs_input else []
+
+     # --- Diagnostic before calling make_treemap_data ---
+     if 'has_robot' in df_current_models.columns:
+         robot_count_before_treemap = df_current_models['has_robot'].sum()
+         print(f"DIAGNOSTIC (ui_generate_plot_controller): df_current_models entering make_treemap_data has {robot_count_before_treemap} 'has_robot' models.")
+     # --- End Diagnostic ---
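# Editor's note - illustrative sketch (not part of the diff): how the filter inputs above are
# resolved. The "skip organizations" textbox is a comma-separated string; blanks are dropped.
# The organization names below are made up for the example.
skip_orgs_input = "TheBloke, , google "
orgs_to_skip = [org.strip() for org in skip_orgs_input.split(',') if org.strip()] if skip_orgs_input else []
print(orgs_to_skip)   # ['TheBloke', 'google']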
+
+     treemap_df = make_treemap_data(df_current_models, metric_choice, k_orgs, tag_to_use, pipeline_to_use, size_to_use, orgs_to_skip)
+
+     title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
+     chart_title = f"HuggingFace Models - {title_labels.get(metric_choice, metric_choice)} by Organization"
+     plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
+     if treemap_df.empty:
+         plot_stats_md = "No data matches the selected filters. Try adjusting your filters."
      else:
+         total_items_in_plot = len(treemap_df['id'].unique())
+         total_value_in_plot = treemap_df[metric_choice].sum()
+         plot_stats_md = (f"## Plot Statistics\n- **Models shown**: {total_items_in_plot:,}\n- **Total {metric_choice}**: {int(total_value_in_plot):,}")
+     return plotly_fig, plot_stats_md
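# Editor's note - illustrative sketch (not part of the diff): the same helpers can be exercised
# outside the Gradio callbacks, assuming models_processed.parquet exists and that
# make_treemap_data / create_treemap (defined earlier in app.py) keep the signatures used above.
# The filter values are arbitrary examples.
import pandas as pd
df_models = pd.read_parquet("models_processed.parquet")
tm_df = make_treemap_data(df_models, "downloads", 25, None, "text-generation", None, [])
fig = create_treemap(tm_df, "downloads", "HuggingFace Models - Downloads (last 30 days) by Organization")
fig.show()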
     demo.load(
+         fn=lambda progress=gr.Progress(track_tqdm=True): ui_load_data_controller(force_refresh_ui_trigger=False, progress=progress),
+         inputs=[],
+         outputs=[models_data_state, loading_complete_state, data_info_md, status_message_md]
     )
     refresh_data_button.click(
+         fn=lambda progress=gr.Progress(track_tqdm=True): ui_load_data_controller(force_refresh_ui_trigger=True, progress=progress),
+         inputs=[],
+         outputs=[models_data_state, loading_complete_state, data_info_md, status_message_md]
     )
 
     generate_plot_button.click(
+         fn=ui_generate_plot_controller,
+         inputs=[count_by_dropdown, filter_choice_radio, tag_filter_dropdown, pipeline_filter_dropdown,
+                 size_filter_dropdown, top_k_slider, skip_orgs_textbox, models_data_state],
+         outputs=[plot_output, status_message_md]
     )
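# Editor's note - optional variant (not part of the diff): the two load paths above differ only
# in force_refresh_ui_trigger, so named wrappers could replace the inline lambdas if that reads
# better or needs testing in isolation. The wrapper names below are hypothetical.
def _load_cached(progress=gr.Progress(track_tqdm=True)):
    return ui_load_data_controller(force_refresh_ui_trigger=False, progress=progress)

def _force_refresh(progress=gr.Progress(track_tqdm=True)):
    return ui_load_data_controller(force_refresh_ui_trigger=True, progress=progress)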

  if __name__ == "__main__":
+     if not os.path.exists(PROCESSED_PARQUET_FILE_PATH):
+         print(f"WARNING: Pre-processed data file '{PROCESSED_PARQUET_FILE_PATH}' not found.")
+         print("It is highly recommended to run the preprocessing script (e.g., preprocess.py) first.") # Corrected script name
+     else:
+         print(f"Found pre-processed data file: '{PROCESSED_PARQUET_FILE_PATH}'.")
+     demo.launch()
+
+ # --- END OF FILE app.py ---
models_processed.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:998afad6c0c4c64f9e98efd8609d1cbab1dd2ac281b9c2e023878ad436c2fbde
+ size 96033487
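# Editor's note (not part of the diff): the three lines above are a Git LFS pointer; the ~96 MB
# parquet itself lives in LFS storage. Once checked out, it can be inspected directly, e.g.:
import pandas as pd
df_chk = pd.read_parquet("models_processed.parquet")
print(df_chk.shape)
print(df_chk[['id', 'organization', 'size_category', 'has_robot']].head())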
preprocess.py ADDED
@@ -0,0 +1,371 @@
+ # --- START OF FILE preprocess.py ---
+
+ import pandas as pd
+ import numpy as np
+ import json
+ import ast
+ from tqdm.auto import tqdm
+ import time
+ import os
+ import duckdb
+ import re # Import re for the manual regex check in debug
+
+ # --- Constants ---
+ PROCESSED_PARQUET_FILE_PATH = "models_processed.parquet"
+ HF_PARQUET_URL = 'https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet'
+
+ MODEL_SIZE_RANGES = {
+     "Small (<1GB)": (0, 1),
+     "Medium (1-5GB)": (1, 5),
+     "Large (5-20GB)": (5, 20),
+     "X-Large (20-50GB)": (20, 50),
+     "XX-Large (>50GB)": (50, float('inf'))
+ }
+
+ # --- Debugging Constant ---
+ # <<<<<<< SET THE MODEL ID YOU WANT TO DEBUG HERE >>>>>>>
+ MODEL_ID_TO_DEBUG = "openvla/openvla-7b"
+ # Example: MODEL_ID_TO_DEBUG = "openai-community/gpt2"
+ # If you don't have a specific ID, the debug block will just report it's not found.
+
+ # --- Utility Functions (extract_model_file_size_gb, extract_org_from_id, process_tags_for_series, get_file_size_category - unchanged from previous correct version) ---
+ def extract_model_file_size_gb(safetensors_data):
+     try:
+         if pd.isna(safetensors_data): return 0.0
+         data_to_parse = safetensors_data
+         if isinstance(safetensors_data, str):
+             try:
+                 if (safetensors_data.startswith('{') and safetensors_data.endswith('}')) or \
+                    (safetensors_data.startswith('[') and safetensors_data.endswith(']')):
+                     data_to_parse = ast.literal_eval(safetensors_data)
+                 else: data_to_parse = json.loads(safetensors_data)
+             except Exception: return 0.0
+         if isinstance(data_to_parse, dict) and 'total' in data_to_parse:
+             total_bytes_val = data_to_parse['total']
+             try:
+                 size_bytes = float(total_bytes_val)
+                 return size_bytes / (1024 * 1024 * 1024)
+             except (ValueError, TypeError): return 0.0
+         return 0.0
+     except Exception: return 0.0
+
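# Editor's note - illustrative sketch (not part of the diff): expected behaviour of
# extract_model_file_size_gb for the shapes of 'safetensors' values seen in the hub dump.
# The byte counts below are made up.
print(extract_model_file_size_gb({'total': 2 * 1024**3}))    # 2.0  (dict with a 'total' byte count)
print(extract_model_file_size_gb('{"total": 1073741824}'))   # 1.0  (same data, but as a string)
print(extract_model_file_size_gb(None))                      # 0.0  (missing/unparseable falls back to 0.0)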
+ def extract_org_from_id(model_id):
+     if pd.isna(model_id): return "unaffiliated"
+     model_id_str = str(model_id)
+     return model_id_str.split("/")[0] if "/" in model_id_str else "unaffiliated"
+
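# Editor's note - illustrative sketch (not part of the diff): the organization is simply the
# namespace portion of the repo id; ids without a namespace fall back to "unaffiliated".
print(extract_org_from_id("openvla/openvla-7b"))   # "openvla"
print(extract_org_from_id("gpt2"))                 # "unaffiliated"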
+ def process_tags_for_series(series_of_tags_values):
+     processed_tags_accumulator = []
+
+     for i, tags_value_from_series in enumerate(tqdm(series_of_tags_values, desc="Standardizing Tags", leave=False, unit="row")):
+         temp_processed_list_for_row = []
+         current_value_for_error_msg = str(tags_value_from_series)[:200] # Truncate for long error messages
+
+         try:
+             # Order of checks is important!
+             # 1. Handle explicit Python lists first
+             if isinstance(tags_value_from_series, list):
+                 current_tags_in_list = []
+                 for idx_tag, tag_item in enumerate(tags_value_from_series):
+                     try:
+                         # Ensure item is not NaN before string conversion if it might be a float NaN in a list
+                         if pd.isna(tag_item): continue
+                         str_tag = str(tag_item)
+                         stripped_tag = str_tag.strip()
+                         if stripped_tag:
+                             current_tags_in_list.append(stripped_tag)
+                     except Exception as e_inner_list_proc:
+                         print(f"ERROR processing item '{tag_item}' (type: {type(tag_item)}) within a list for row {i}. Error: {e_inner_list_proc}. Original list: {current_value_for_error_msg}")
+                 temp_processed_list_for_row = current_tags_in_list
+
+             # 2. Handle NumPy arrays
+             elif isinstance(tags_value_from_series, np.ndarray):
+                 # Convert to list, then process elements, handling potential NaNs within the array
+                 current_tags_in_list = []
+                 for idx_tag, tag_item in enumerate(tags_value_from_series.tolist()): # .tolist() is crucial
+                     try:
+                         if pd.isna(tag_item): continue # Check for NaN after converting to Python type
+                         str_tag = str(tag_item)
+                         stripped_tag = str_tag.strip()
+                         if stripped_tag:
+                             current_tags_in_list.append(stripped_tag)
+                     except Exception as e_inner_array_proc:
+                         print(f"ERROR processing item '{tag_item}' (type: {type(tag_item)}) within a NumPy array for row {i}. Error: {e_inner_array_proc}. Original array: {current_value_for_error_msg}")
+                 temp_processed_list_for_row = current_tags_in_list
+
+             # 3. Handle simple None or pd.NA after lists and arrays (which might contain pd.NA elements handled above)
+             elif tags_value_from_series is None or pd.isna(tags_value_from_series): # Now pd.isna is safe for scalars
+                 temp_processed_list_for_row = []
+
+             # 4. Handle strings (could be JSON-like, list-like, or comma-separated)
+             elif isinstance(tags_value_from_series, str):
+                 processed_str_tags = []
+                 # Attempt ast.literal_eval for strings that look like lists/tuples
+                 if (tags_value_from_series.startswith('[') and tags_value_from_series.endswith(']')) or \
+                    (tags_value_from_series.startswith('(') and tags_value_from_series.endswith(')')):
+                     try:
+                         evaluated_tags = ast.literal_eval(tags_value_from_series)
+                         if isinstance(evaluated_tags, (list, tuple)): # Check if eval result is a list/tuple
+                             # Recursively process this evaluated list/tuple, as its elements could be complex
+                             # For simplicity here, assume elements are simple strings after eval
+                             current_eval_list = []
+                             for tag_item in evaluated_tags:
+                                 if pd.isna(tag_item): continue
+                                 str_tag = str(tag_item).strip()
+                                 if str_tag: current_eval_list.append(str_tag)
+                             processed_str_tags = current_eval_list
+                     except (ValueError, SyntaxError):
+                         pass # If ast.literal_eval fails, let it fall to JSON or comma split
+
+                 # If ast.literal_eval didn't populate, try JSON
+                 if not processed_str_tags:
+                     try:
+                         json_tags = json.loads(tags_value_from_series)
+                         if isinstance(json_tags, list):
+                             # Similar to above, assume elements are simple strings after JSON parsing
+                             current_json_list = []
+                             for tag_item in json_tags:
+                                 if pd.isna(tag_item): continue
+                                 str_tag = str(tag_item).strip()
+                                 if str_tag: current_json_list.append(str_tag)
+                             processed_str_tags = current_json_list
+                     except json.JSONDecodeError:
+                         # If not a valid JSON list, fall back to comma splitting as the final string strategy
+                         processed_str_tags = [tag.strip() for tag in tags_value_from_series.split(',') if tag.strip()]
+                     except Exception as e_json_other:
+                         print(f"ERROR during JSON processing for string '{current_value_for_error_msg}' for row {i}. Error: {e_json_other}")
+                         processed_str_tags = [tag.strip() for tag in tags_value_from_series.split(',') if tag.strip()] # Fallback
+
+                 temp_processed_list_for_row = processed_str_tags
+
+             # 5. Fallback for other scalar types (e.g., int, float that are not NaN)
+             else:
+                 # This path is for non-list, non-ndarray, non-None/NaN, non-string types.
+                 # Or for NaNs that slipped through if they are not None or pd.NA (e.g. float('nan'))
+                 if pd.isna(tags_value_from_series): # Catch any remaining NaNs like float('nan')
+                     temp_processed_list_for_row = []
+                 else:
+                     str_val = str(tags_value_from_series).strip()
+                     temp_processed_list_for_row = [str_val] if str_val else []
+
+             processed_tags_accumulator.append(temp_processed_list_for_row)
+
+         except Exception as e_outer_tag_proc:
+             print(f"CRITICAL UNHANDLED ERROR processing row {i}: value '{current_value_for_error_msg}' (type: {type(tags_value_from_series)}). Error: {e_outer_tag_proc}. Appending [].")
+             processed_tags_accumulator.append([])
+
+     return processed_tags_accumulator
+
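# Editor's note - illustrative sketch (not part of the diff): the kinds of raw 'tags' values the
# hub parquet can contain and how process_tags_for_series normalizes each row to a plain list of
# stripped strings. The example values are made up.
import numpy as np
import pandas as pd
raw = pd.Series([
    ['robotics', ' vision '],        # already a Python list
    np.array(['text', None]),        # NumPy array with a missing entry
    '["audio", "speech"]',           # list-like / JSON string
    'nlp, llm',                      # comma-separated string
    None,                            # missing value
])
print(process_tags_for_series(raw))
# [['robotics', 'vision'], ['text'], ['audio', 'speech'], ['nlp', 'llm'], []]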
+ def get_file_size_category(file_size_gb_val):
+     try:
+         numeric_file_size_gb = float(file_size_gb_val)
+         if pd.isna(numeric_file_size_gb): numeric_file_size_gb = 0.0
+     except (ValueError, TypeError): numeric_file_size_gb = 0.0
+     if 0 <= numeric_file_size_gb < 1: return "Small (<1GB)"
+     elif 1 <= numeric_file_size_gb < 5: return "Medium (1-5GB)"
+     elif 5 <= numeric_file_size_gb < 20: return "Large (5-20GB)"
+     elif 20 <= numeric_file_size_gb < 50: return "X-Large (20-50GB)"
+     elif numeric_file_size_gb >= 50: return "XX-Large (>50GB)"
+     else: return "Small (<1GB)"
+
+
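# Editor's note - illustrative sketch (not part of the diff): boundary behaviour of
# get_file_size_category; anything non-numeric or missing falls back to "Small (<1GB)".
for gb in [0.2, 1, 4.9, 20, 75, None]:
    print(gb, "->", get_file_size_category(gb))
# 0.2 -> Small (<1GB), 1 -> Medium (1-5GB), 4.9 -> Medium (1-5GB),
# 20 -> X-Large (20-50GB), 75 -> XX-Large (>50GB), None -> Small (<1GB)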
+ def main_preprocessor():
+     print(f"Starting pre-processing script. Output: '{PROCESSED_PARQUET_FILE_PATH}'.")
+     overall_start_time = time.time()
+
+     print(f"Fetching fresh data from Hugging Face: {HF_PARQUET_URL}")
+     try:
+         fetch_start_time = time.time()
+         query = f"SELECT * FROM read_parquet('{HF_PARQUET_URL}')"
+         df_raw = duckdb.sql(query).df()
+         data_download_timestamp = pd.Timestamp.now(tz='UTC')
+
+         if df_raw is None or df_raw.empty: raise ValueError("Fetched data is empty or None.")
+         if 'id' not in df_raw.columns: raise ValueError("Fetched data must contain 'id' column.")
+
+         print(f"Fetched data in {time.time() - fetch_start_time:.2f}s. Rows: {len(df_raw)}. Downloaded at: {data_download_timestamp.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+     except Exception as e_fetch:
+         print(f"ERROR: Could not fetch data from Hugging Face: {e_fetch}.")
+         return
+
+     df = pd.DataFrame()
+     print("Processing raw data...")
+     proc_start = time.time()
+
+     expected_cols_setup = {
+         'id': str, 'downloads': float, 'downloadsAllTime': float, 'likes': float,
+         'pipeline_tag': str, 'tags': object, 'safetensors': object
+     }
+     for col_name, target_dtype in expected_cols_setup.items():
+         if col_name in df_raw.columns:
+             df[col_name] = df_raw[col_name]
+             if target_dtype == float: df[col_name] = pd.to_numeric(df[col_name], errors='coerce').fillna(0.0)
+             elif target_dtype == str: df[col_name] = df[col_name].astype(str).fillna('')
+         else:
+             if col_name in ['downloads', 'downloadsAllTime', 'likes']: df[col_name] = 0.0
+             elif col_name == 'pipeline_tag': df[col_name] = ''
+             elif col_name == 'tags': df[col_name] = pd.Series([[] for _ in range(len(df_raw))]) # Initialize with empty lists
+             elif col_name == 'safetensors': df[col_name] = None # Initialize with None
+             elif col_name == 'id': print("CRITICAL ERROR: 'id' column missing."); return
+
+     output_filesize_col_name = 'params'
+     if output_filesize_col_name in df_raw.columns and pd.api.types.is_numeric_dtype(df_raw[output_filesize_col_name]):
+         print(f"Using pre-existing '{output_filesize_col_name}' column as file size in GB.")
+         df[output_filesize_col_name] = pd.to_numeric(df_raw[output_filesize_col_name], errors='coerce').fillna(0.0)
+     elif 'safetensors' in df.columns:
+         print(f"Calculating '{output_filesize_col_name}' (file size in GB) from 'safetensors' data...")
+         df[output_filesize_col_name] = df['safetensors'].apply(extract_model_file_size_gb)
+         df[output_filesize_col_name] = pd.to_numeric(df[output_filesize_col_name], errors='coerce').fillna(0.0)
+     else:
+         print(f"Cannot determine file size. Setting '{output_filesize_col_name}' to 0.0.")
+         df[output_filesize_col_name] = 0.0
+
+     df['data_download_timestamp'] = data_download_timestamp
+     print(f"Added 'data_download_timestamp' column.")
+
+     print("Categorizing models by file size...")
+     df['size_category'] = df[output_filesize_col_name].apply(get_file_size_category)
+
+     print("Standardizing 'tags' column...")
+     df['tags'] = process_tags_for_series(df['tags']) # This now uses tqdm internally
+
+     # --- START DEBUGGING BLOCK ---
+     # This block will execute before the main tag processing loop
+     if MODEL_ID_TO_DEBUG and MODEL_ID_TO_DEBUG in df['id'].values: # Check if ID exists
+         print(f"\n--- Pre-Loop Debugging for Model ID: {MODEL_ID_TO_DEBUG} ---")
+
+         # 1. Check the 'tags' column content after process_tags_for_series
+         model_specific_tags_list = df.loc[df['id'] == MODEL_ID_TO_DEBUG, 'tags'].iloc[0]
+         print(f"1. Tags from df['tags'] (after process_tags_for_series): {model_specific_tags_list}")
+         print(f"   Type of tags: {type(model_specific_tags_list)}")
+         if isinstance(model_specific_tags_list, list):
+             for i, tag_item in enumerate(model_specific_tags_list):
+                 print(f"   Tag item {i}: '{tag_item}' (type: {type(tag_item)}, len: {len(str(tag_item))})")
+                 # Detailed check for 'robotics' specifically
+                 if 'robotics' in str(tag_item).lower():
+                     print(f"     DEBUG: Found 'robotics' substring in '{tag_item}'")
+                     print(f"       - str(tag_item).lower().strip(): '{str(tag_item).lower().strip()}'")
+                     print(f"       - Is it exactly 'robotics'?: {str(tag_item).lower().strip() == 'robotics'}")
+                     print(f"       - Ordinals: {[ord(c) for c in str(tag_item)]}")
+
+         # 2. Simulate temp_tags_joined for this specific model
+         if isinstance(model_specific_tags_list, list):
+             simulated_temp_tags_joined = '~~~'.join(str(t).lower().strip() for t in model_specific_tags_list if pd.notna(t) and str(t).strip())
+         else:
+             simulated_temp_tags_joined = ''
+         print(f"2. Simulated 'temp_tags_joined' for this model: '{simulated_temp_tags_joined}'")
+
+         # 3. Simulate 'has_robot' check for this model
+         robot_keywords = ['robot', 'robotics']
+         robot_pattern = '|'.join(robot_keywords)
+         manual_robot_check = bool(re.search(robot_pattern, simulated_temp_tags_joined, flags=re.IGNORECASE))
+         print(f"3. Manual regex check for 'has_robot' ('{robot_pattern}' in '{simulated_temp_tags_joined}'): {manual_robot_check}")
+         print(f"--- End Pre-Loop Debugging for Model ID: {MODEL_ID_TO_DEBUG} ---\n")
+     elif MODEL_ID_TO_DEBUG:
+         print(f"DEBUG: Model ID '{MODEL_ID_TO_DEBUG}' not found in DataFrame for pre-loop debugging.")
+     # --- END DEBUGGING BLOCK ---
+
+
+     print("Vectorized creation of cached tag columns...")
+     tag_time = time.time()
+     # This is the original temp_tags_joined creation:
+     df['temp_tags_joined'] = df['tags'].apply(
+         lambda tl: '~~~'.join(str(t).lower().strip() for t in tl if pd.notna(t) and str(t).strip()) if isinstance(tl, list) else ''
+     )
+
+     tag_map = {
+         'has_audio': ['audio'], 'has_speech': ['speech'], 'has_music': ['music'],
+         'has_robot': ['robot', 'robotics','openvla','vla'],
+         'has_bio': ['bio'], 'has_med': ['medic', 'medical'],
+         'has_series': ['series', 'time-series', 'timeseries'],
+         'has_video': ['video'], 'has_image': ['image', 'vision'],
+         'has_text': ['text', 'nlp', 'llm']
+     }
+     for col, kws in tag_map.items():
+         pattern = '|'.join(kws)
+         df[col] = df['temp_tags_joined'].str.contains(pattern, na=False, case=False, regex=True)
+
+     df['has_science'] = (
+         df['temp_tags_joined'].str.contains('science', na=False, case=False, regex=True) &
+         ~df['temp_tags_joined'].str.contains('bigscience', na=False, case=False, regex=True)
+     )
+     del df['temp_tags_joined'] # Clean up temporary column
+     df['is_audio_speech'] = (df['has_audio'] | df['has_speech'] |
+                              df['pipeline_tag'].str.contains('audio|speech', case=False, na=False, regex=True))
+     df['is_biomed'] = df['has_bio'] | df['has_med']
+     print(f"Vectorized tag columns created in {time.time() - tag_time:.2f}s.")
+
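# Editor's note - illustrative sketch (not part of the diff): what the vectorized flags above do
# for a single row. The tags list is made up; the keyword patterns come from tag_map.
import re
tags_row = ['OpenVLA', 'Robotics', 'image-text-to-text']
joined = '~~~'.join(t.lower().strip() for t in tags_row)   # 'openvla~~~robotics~~~image-text-to-text'
print(bool(re.search('robot|robotics|openvla|vla', joined, re.IGNORECASE)))   # True  -> has_robot
print(bool(re.search('image|vision', joined, re.IGNORECASE)))                 # True  -> has_image
print(bool(re.search('music', joined, re.IGNORECASE)))                        # False -> has_music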
+     # --- POST-LOOP DIAGNOSTIC for has_robot & a specific model ---
+     if 'has_robot' in df.columns:
+         print("\n--- 'has_robot' Diagnostics (Preprocessor - Post-Loop) ---")
+         print(df['has_robot'].value_counts(dropna=False))
+
+         if MODEL_ID_TO_DEBUG and MODEL_ID_TO_DEBUG in df['id'].values:
+             model_has_robot_val = df.loc[df['id'] == MODEL_ID_TO_DEBUG, 'has_robot'].iloc[0]
+             print(f"Value of 'has_robot' for model '{MODEL_ID_TO_DEBUG}': {model_has_robot_val}")
+             if model_has_robot_val:
+                 print(f"  Original tags for '{MODEL_ID_TO_DEBUG}': {df.loc[df['id'] == MODEL_ID_TO_DEBUG, 'tags'].iloc[0]}")
+
+         if df['has_robot'].any():
+             print("Sample models flagged as 'has_robot':")
+             print(df[df['has_robot']][['id', 'tags', 'has_robot']].head(5))
+         else:
+             print("No models were flagged as 'has_robot' after processing.")
+         print("--------------------------------------------------------\n")
+     # --- END POST-LOOP DIAGNOSTIC ---
+
+
+     print("Adding organization column...")
+     df['organization'] = df['id'].apply(extract_org_from_id)
+
+     # Drop safetensors if params was calculated from it, and params didn't pre-exist as numeric
+     if 'safetensors' in df.columns and \
+        not (output_filesize_col_name in df_raw.columns and pd.api.types.is_numeric_dtype(df_raw[output_filesize_col_name])):
+         df = df.drop(columns=['safetensors'], errors='ignore')
+
+     final_expected_cols = [
+         'id', 'downloads', 'downloadsAllTime', 'likes', 'pipeline_tag', 'tags',
+         'params', 'size_category', 'organization',
+         'has_audio', 'has_speech', 'has_music', 'has_robot', 'has_bio', 'has_med',
+         'has_series', 'has_video', 'has_image', 'has_text', 'has_science',
+         'is_audio_speech', 'is_biomed',
+         'data_download_timestamp'
+     ]
+     # Ensure all final columns exist, adding defaults if necessary
+     for col in final_expected_cols:
+         if col not in df.columns:
+             print(f"Warning: Final expected column '{col}' is missing! Defaulting appropriately.")
+             if col == 'params': df[col] = 0.0
+             elif col == 'size_category': df[col] = "Small (<1GB)" # Default size category
+             elif 'has_' in col or 'is_' in col : df[col] = False # Default boolean flags to False
+             elif col == 'data_download_timestamp': df[col] = pd.NaT # Default timestamp to NaT
+
+     print(f"Data processing completed in {time.time() - proc_start:.2f}s.")
+     try:
+         print(f"Saving processed data to: {PROCESSED_PARQUET_FILE_PATH}")
+         df_to_save = df[final_expected_cols].copy() # Ensure only expected columns are saved
+         df_to_save.to_parquet(PROCESSED_PARQUET_FILE_PATH, index=False, engine='pyarrow')
+         print(f"Successfully saved processed data.")
+     except Exception as e_save:
+         print(f"ERROR: Could not save processed data: {e_save}")
+         return
+
+     total_elapsed_script = time.time() - overall_start_time
+     print(f"Pre-processing finished. Total time: {total_elapsed_script:.2f}s. Final Parquet shape: {df_to_save.shape}")
+
+ if __name__ == "__main__":
+     if os.path.exists(PROCESSED_PARQUET_FILE_PATH):
+         print(f"Deleting existing '{PROCESSED_PARQUET_FILE_PATH}' to ensure fresh processing...")
+         try: os.remove(PROCESSED_PARQUET_FILE_PATH)
+         except OSError as e: print(f"Error deleting file: {e}. Please delete manually and rerun."); exit()
+
+     main_preprocessor()
+
+     if os.path.exists(PROCESSED_PARQUET_FILE_PATH):
+         print(f"\nTo verify, load parquet and check 'has_robot' and its 'tags':")
+         print(f"import pandas as pd; df_chk = pd.read_parquet('{PROCESSED_PARQUET_FILE_PATH}')")
+         print(f"print(df_chk['has_robot'].value_counts())")
+         if MODEL_ID_TO_DEBUG:
+             print(f"print(df_chk[df_chk['id'] == '{MODEL_ID_TO_DEBUG}'][['id', 'tags', 'has_robot']])")
+         else:
+             print(f"print(df_chk[df_chk['has_robot']][['id', 'tags', 'has_robot']].head())")