Spaces: Runtime error

Ahmed Ahmed committed · Commit 3a2ac99 · 1 Parent(s): 21bc425

no more dynamic updates

Browse files
app.py
CHANGED
@@ -40,149 +40,14 @@ def init_leaderboard(dataframe):
         ],
     )
 
-def refresh_leaderboard():
-    import sys
-    import traceback
-    import pandas as pd
-
-    try:
-        sys.stderr.write("=== REFRESH LEADERBOARD DEBUG ===\n")
-        sys.stderr.write("Refreshing leaderboard data...\n")
-        sys.stderr.flush()
-
-        # Get fresh leaderboard data
-        df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
-
-        sys.stderr.write(f"get_leaderboard_df returned: {type(df)}\n")
-        if df is not None:
-            sys.stderr.write(f"DataFrame shape: {df.shape}\n")
-            sys.stderr.write(f"DataFrame columns: {df.columns.tolist()}\n")
-            sys.stderr.write(f"DataFrame empty: {df.empty}\n")
-        else:
-            sys.stderr.write("DataFrame is None!\n")
-        sys.stderr.flush()
-
-        # Check if DataFrame is valid for leaderboard
-        if df is None:
-            sys.stderr.write("DataFrame is None, creating fallback DataFrame\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame
-            df = create_fallback_dataframe()
-
-        elif df.empty:
-            sys.stderr.write("DataFrame is empty, creating fallback DataFrame\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame for empty case
-            df = create_fallback_dataframe()
-
-        elif not all(col in df.columns for col in COLS):
-            sys.stderr.write(f"DataFrame missing required columns. Has: {df.columns.tolist()}, Needs: {COLS}\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame for missing columns
-            df = create_fallback_dataframe()
-
-        sys.stderr.write(f"Final DataFrame for leaderboard - Shape: {df.shape}, Columns: {df.columns.tolist()}\n")
-        sys.stderr.flush()
-
-        # Ensure DataFrame has the exact columns expected
-        for col in COLS:
-            if col not in df.columns:
-                sys.stderr.write(f"Adding missing column: {col}\n")
-                if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
-                    df[col] = 0.0
-                elif col == AutoEvalColumn.model.name:
-                    df[col] = "Unknown Model"
-                elif col == AutoEvalColumn.model_type_symbol.name:
-                    df[col] = "?"
-                else:
-                    df[col] = ""
-        sys.stderr.flush()
-
-        # Reorder columns to match expected order
-        df = df[COLS]
-
-        sys.stderr.write("Creating leaderboard component...\n")
-        sys.stderr.flush()
-
-        new_leaderboard = init_leaderboard(df)
-        sys.stderr.write("Leaderboard component created successfully\n")
-        sys.stderr.flush()
-
-        return new_leaderboard
-
-    except Exception as e:
-        error_msg = str(e)
-        traceback_str = traceback.format_exc()
-        sys.stderr.write(f"CRITICAL ERROR in refresh_leaderboard: {error_msg}\n")
-        sys.stderr.write(f"Traceback: {traceback_str}\n")
-        sys.stderr.flush()
-
-        # Create emergency fallback leaderboard
-        try:
-            sys.stderr.write("Creating emergency fallback leaderboard...\n")
-            sys.stderr.flush()
-            fallback_df = create_fallback_dataframe()
-            return init_leaderboard(fallback_df)
-        except Exception as fallback_error:
-            sys.stderr.write(f"Even fallback failed: {fallback_error}\n")
-            sys.stderr.flush()
-            raise Exception(f"Complete leaderboard failure: {error_msg}")
-
-def create_fallback_dataframe():
-    """Create a minimal valid DataFrame that won't crash the leaderboard"""
-    import pandas as pd
-    import sys
-
-    sys.stderr.write("Creating fallback DataFrame...\n")
-    sys.stderr.flush()
-
-    # Create minimal valid data
-    fallback_data = {col: [] for col in COLS}
-
-    # Add one dummy row to prevent leaderboard component from crashing
-    dummy_row = {}
-    for col in COLS:
-        if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.model.name:
-            dummy_row[col] = "No models evaluated yet"
-        elif col == AutoEvalColumn.model_type_symbol.name:
-            dummy_row[col] = "?"
-        elif col == AutoEvalColumn.precision.name:
-            dummy_row[col] = "float16"
-        elif col == AutoEvalColumn.model_type.name:
-            dummy_row[col] = "pretrained"
-        elif col == AutoEvalColumn.weight_type.name:
-            dummy_row[col] = "Original"
-        elif col == AutoEvalColumn.architecture.name:
-            dummy_row[col] = "Unknown"
-        elif col == AutoEvalColumn.still_on_hub.name:
-            dummy_row[col] = True
-        elif col == AutoEvalColumn.license.name:
-            dummy_row[col] = "Unknown"
-        elif col == AutoEvalColumn.params.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.likes.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.revision.name:
-            dummy_row[col] = ""
-        else:
-            dummy_row[col] = ""
-
-    df = pd.DataFrame([dummy_row])
-    sys.stderr.write(f"Fallback DataFrame created with shape: {df.shape}\n")
-    sys.stderr.write(f"Fallback DataFrame columns: {df.columns.tolist()}\n")
-    sys.stderr.flush()
-
-    return df
-
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
     import sys
     import traceback
+    import gradio as gr
 
     if not model_name:
-        return "Please enter a model name."
+        return "Please enter a model name."
 
     try:
         # Use stderr for more reliable logging in HF Spaces

@@ -197,37 +62,24 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.flush()
 
         if success:
-            […]
-            except Exception as refresh_error:
-                # If leaderboard refresh fails, still show success but don't update leaderboard
-                error_msg = str(refresh_error)
-                traceback_str = traceback.format_exc()
-                sys.stderr.write(f"Leaderboard refresh failed: {error_msg}\n")
-                sys.stderr.write(f"Traceback: {traceback_str}\n")
-                sys.stderr.flush()
-
-                # Check if it's the specific "must have a value set" error
-                if "must have a value set" in error_msg.lower():
-                    return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard component failed to update due to data structure issue.\n\n**Please refresh the page** to see your results in the main leaderboard.", None
-                else:
-                    return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard refresh failed: {error_msg}\n\nPlease refresh the page to see updated results.", None
+            sys.stderr.write("Evaluation succeeded - results saved to dataset\n")
+            sys.stderr.flush()
+
+            return f"""✅ **Perplexity evaluation completed successfully!**
+
+**Model**: {model_name}
+**Perplexity Score**: {result:.4f}
+
+🎉 **Results have been saved to the dataset.**
+
+📋 **To see your results in the leaderboard:**
+1. Click on the **🏅 Leaderboard** tab above
+2. Refresh the page (Ctrl+R or Cmd+R)
+3. Your model should now appear in the rankings!
+
+💡 **Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
         else:
-            return f"❌ Evaluation failed
+            return f"❌ **Evaluation failed**: {result}"
 
     except Exception as e:
         error_msg = str(e)

@@ -235,7 +87,7 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
         sys.stderr.write(f"Traceback: {traceback_str}\n")
         sys.stderr.flush()
-        return f"❌ Critical error
+        return f"❌ **Critical error**: {error_msg}"
 
 # Initialize results repository and directory
 try:

@@ -301,15 +153,22 @@ with demo:
 
     gr.Markdown("""
     ### Tips:
-    - Check stderr logs in HF Spaces for detailed debugging information
-    -
-    - Example models to test
+    - **Check stderr logs** in HF Spaces for detailed debugging information
+    - **After evaluation completes**, click the 🏅 Leaderboard tab and refresh the page to see results
+    - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
+    - **Lower perplexity scores = better performance** (better at predicting text)
+
+    ### How it works:
+    1. Enter a model name from Hugging Face Hub
+    2. Click "Run Perplexity Test"
+    3. Wait for evaluation to complete (may take a few minutes for large models)
+    4. Go to 🏅 Leaderboard tab and refresh the page to see your results!
     """)
 
     test_button.click(
         run_perplexity_test,
         [model_name, revision, precision],
-        [result
+        [result]
     )
 
     demo.queue(default_concurrency_limit=5).launch()
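The net effect of this change is that run_perplexity_test now returns a single Markdown string and the click handler writes only to the result component; nothing tries to rebuild the Leaderboard object from inside a callback anymore. Below is a minimal, self-contained sketch of that wiring, not the Space's actual code: component names follow the diff, but the evaluation body is stubbed out and the stub's return message is illustrative only.

# Minimal sketch of the post-commit wiring: one event handler, one Markdown output,
# no dynamic leaderboard refresh. The real Space builds its leaderboard separately
# via init_leaderboard(); that part is omitted here.
import gradio as gr

def run_perplexity_test(model_name, revision, precision):
    """Run perplexity evaluation on demand (evaluation itself stubbed out here)."""
    if not model_name:
        return "Please enter a model name."
    # ... run the evaluation and save the result file here ...
    return f"✅ Perplexity evaluation completed for {model_name} ({precision}, revision {revision})."

with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name")
    revision = gr.Textbox(label="Revision", value="main")
    precision = gr.Dropdown(choices=["float16", "bfloat16"], value="float16", label="Precision")
    test_button = gr.Button("Run Perplexity Test")
    result = gr.Markdown()

    # The outputs list contains only `result`; the leaderboard tab is refreshed
    # manually by reloading the page, as the Tips section in the diff explains.
    test_button.click(run_perplexity_test, [model_name, revision, precision], [result])

demo.queue(default_concurrency_limit=5).launch()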
logs.txt
CHANGED
@@ -1,39 +1,18 @@
-==== Application Startup at 2025-07-25 22:55:49 =====
 
+Searching for result files in: ./eval-results
+Found 7 result files
 
-.gitattributes: 100%|██████████| 2.46k/2.46k [00:00<00:00, 10.5MB/s]
-
-(…)enai-community_gpt2_20250725_231201.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_231201.json: 100%|██████████| 209/209 [00:00<00:00, 1.71MB/s]
-
-(…)enai-community_gpt2_20250725_233155.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_233155.json: 100%|██████████| 209/209 [00:00<00:00, 1.26MB/s]
-
-(…)enai-community_gpt2_20250725_235115.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_235115.json: 100%|██████████| 209/209 [00:00<00:00, 2.02MB/s]
-
-(…)enai-community_gpt2_20250725_235748.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_235748.json: 100%|██████████| 209/209 [00:00<00:00, 2.08MB/s]
-
-(…)enai-community_gpt2_20250726_000358.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250726_000358.json: 100%|██████████| 209/209 [00:00<00:00, 1.54MB/s]
-
-(…)enai-community_gpt2_20250726_000650.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250726_000650.json: 100%|██████████| 209/209 [00:00<00:00, 2.35MB/s]
-
-=== Starting leaderboard creation ===
-Looking for results in: ./eval-results
-Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
-Benchmark columns: ['Perplexity']
+Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
 
+config.json: 0%| | 0.00/1.35k [00:00<?, ?B/s]
+config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 17.2MB/s]
+Created result object for: EleutherAI/gpt-neo-1.3B
+Added new result for EleutherAI_gpt-neo-1.3B_float16
 
 Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
 
 config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
-config.json: 100%|██████████| 665/665 [00:00<00:00,
+config.json: 100%|██████████| 665/665 [00:00<00:00, 8.83MB/s]
 Created result object for: openai-community/gpt2
 Added new result for openai-community_gpt2_float16
 

@@ -57,112 +36,176 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
 Created result object for: openai-community/gpt2
 Updated existing result for openai-community_gpt2_float16
 
-Processing
+Processing 2 evaluation results
+
+Converting result to dict for: EleutherAI/gpt-neo-1.3B
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: EleutherAI/gpt-neo-1.3B
+Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
+Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
 
 Converting result to dict for: openai-community/gpt2
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
 Successfully converted and added result
 
-Returning
+Returning 2 processed results
 
-Found
+Found 2 raw results
+Processing result 1/2: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: EleutherAI/gpt-neo-1.3B
+Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
+Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 1/2: EleutherAI/gpt-neo-1.3B
+Processing result 2/2: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 2/2: openai-community/gpt2
 
-Converted to
+Converted to 2 JSON records
 Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-DataFrame shape: (
+DataFrame shape: (2, 14)
 
 Sorted DataFrame by average
 
 Selected and rounded columns
 
-Final DataFrame shape after filtering: (
+Final DataFrame shape after filtering: (2, 12)
 Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+=== FINAL RESULT: DataFrame with 2 rows and 12 columns ===
 
 === Initializing Leaderboard ===
-DataFrame shape: (
+DataFrame shape: (2, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr=False` in `launch()`)
 
 To create a public link, set `share=True` in `launch()`.
 
-===
-Model:
+=== RUNNING PERPLEXITY TEST ===
+Model: openai-community/gpt2-large
 Revision: main
 Precision: float16
-Starting dynamic evaluation for
+Starting dynamic evaluation for openai-community/gpt2-large
 Running perplexity evaluation...
-Loading model:
+Loading model: openai-community/gpt2-large (revision: main)
 Loading tokenizer...
 
-tokenizer_config.json: 0%| | 0.00/
-tokenizer_config.json: 100%|██████████|
+tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]
+tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 183kB/s]
 
-config.json: 0%| | 0.00/
-config.json: 100%|██████████|
+config.json: 0%| | 0.00/666 [00:00<?, ?B/s]
+config.json: 100%|██████████| 666/666 [00:00<00:00, 7.11MB/s]
 
-vocab.json: 0%| | 0.00/
-vocab.json: 100%|██████████|
+vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
+vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 45.7MB/s]
 
 merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
-merges.txt: 100%|██████████| 456k/456k [00:00<00:00,
+merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 44.9MB/s]
 
+tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]
+tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 25.3MB/s]
 Tokenizer loaded successfully
 Loading model...
 
-model.safetensors: 0%| | 0.00/
-model.safetensors: 0%| |
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors: 91%|█████████ | 4.84G/5.31G [00:14<00:00, 494MB/s]
-model.safetensors: 100%|██████████| 5.31G/5.31G [00:14<00:00, 355MB/s]
+model.safetensors: 0%| | 0.00/3.25G [00:00<?, ?B/s]
+model.safetensors: 0%| | 3.99M/3.25G [00:01<18:26, 2.93MB/s]
+model.safetensors: 4%|▍ | 138M/3.25G [00:02<00:47, 65.1MB/s]
+model.safetensors: 7%|▋ | 235M/3.25G [00:03<00:46, 65.4MB/s]
+model.safetensors: 28%|██▊ | 905M/3.25G [00:05<00:09, 258MB/s]
+model.safetensors: 46%|████▋ | 1.51G/3.25G [00:06<00:04, 360MB/s]
+model.safetensors: 71%|███████ | 2.31G/3.25G [00:07<00:01, 484MB/s]
+model.safetensors: 98%|█████████▊| 3.18G/3.25G [00:08<00:00, 593MB/s]
+model.safetensors: 100%|██████████| 3.25G/3.25G [00:08<00:00, 390MB/s]
+
+generation_config.json: 0%| | 0.00/124 [00:00<?, ?B/s]
+generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 1.04MB/s]
 Model loaded successfully
 Tokenizing input text...
 Tokenized input shape: torch.Size([1, 141])
 Moved inputs to device: cpu
 Running forward pass...
+`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
+Calculated loss: 2.1944427490234375
+Final perplexity: 8.974998474121094
+Perplexity evaluation completed: 8.974998474121094
+Created result structure: {'config': {'model_dtype': 'torch.float16', 'model_name': 'openai-community/gpt2-large', 'model_sha': 'main'}, 'results': {'perplexity': {'perplexity': 8.974998474121094}}}
+Saving result to: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
 Result file saved locally
 Uploading to HF dataset: ahmedsqrd/results
 Upload completed successfully
-Evaluation result - Success: True, Result:
+Evaluation result - Success: True, Result: 8.974998474121094
 Attempting to refresh leaderboard...
+=== REFRESH LEADERBOARD DEBUG ===
 Refreshing leaderboard data...
 
-===
+=== GET_LEADERBOARD_DF DEBUG ===
+Starting leaderboard creation...
 Looking for results in: ./eval-results
 Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 Benchmark columns: ['Perplexity']
 
 Searching for result files in: ./eval-results
-Found
+Found 8 result files
+
+Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
+Created result object for: EleutherAI/gpt-neo-1.3B
+Added new result for EleutherAI_gpt-neo-1.3B_float16
 
 Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
 Created result object for: openai-community/gpt2

@@ -188,67 +231,151 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
 Created result object for: openai-community/gpt2
 Updated existing result for openai-community_gpt2_float16
 
-Processing file: ./eval-results/
-Created result object for:
-Added new result for
+Processing file: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
+Created result object for: openai-community/gpt2-large
+Added new result for openai-community_gpt2-large_float16
 
-Processing
+Processing 3 evaluation results
 
-Converting result to dict for: openai-community/gpt2
-
-Processing result for model: openai-community/gpt2
-Raw results: {'perplexity': 20.663532257080078}
-Calculated average score: 69.7162958010531
-Added perplexity score 20.663532257080078 under column Perplexity
-Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-Successfully converted and added result
 Converting result to dict for: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: EleutherAI/gpt-neo-1.3B
 Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
 Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
 Successfully converted and added result
 
-Found 2 raw results
+Converting result to dict for: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
+
+Converting result to dict for: openai-community/gpt2-large
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2-large
+Raw results: {'perplexity': 8.974998474121094}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 8.974998474121094
+Converted score: 78.05557235640035
+Calculated average score: 78.05557235640035
+Created base data_dict with 13 columns
+Added task score: Perplexity = 8.974998474121094
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
+
+Returning 3 processed results
+
+Found 3 raw results
+Processing result 1/3: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: EleutherAI/gpt-neo-1.3B
 Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
 Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 1/3: EleutherAI/gpt-neo-1.3B
+Processing result 2/3: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2
+Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
+Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 2/3: openai-community/gpt2
+Processing result 3/3: openai-community/gpt2-large
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2-large
+Raw results: {'perplexity': 8.974998474121094}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 8.974998474121094
+Converted score: 78.05557235640035
+Calculated average score: 78.05557235640035
+Created base data_dict with 13 columns
+Added task score: Perplexity = 8.974998474121094
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 3/3: openai-community/gpt2-large
+
+Converted to 3 JSON records
 Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-DataFrame shape: (
+DataFrame shape: (3, 14)
 
 Sorted DataFrame by average
 
 Selected and rounded columns
 
-Final DataFrame shape after filtering: (
+Final DataFrame shape after filtering: (3, 12)
 Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+=== FINAL RESULT: DataFrame with 3 rows and 12 columns ===
+get_leaderboard_df returned: <class 'pandas.core.frame.DataFrame'>
+DataFrame shape: (3, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+DataFrame empty: False
+Final DataFrame for leaderboard - Shape: (3, 12), Columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+Creating leaderboard component...
 
 === Initializing Leaderboard ===
-DataFrame shape: (
+DataFrame shape: (3, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+Leaderboard component created successfully
 Leaderboard refresh successful
 Traceback (most recent call last):
   File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events
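The log trail above ("Calculated loss: 2.1944427490234375" followed by "Final perplexity: 8.974998474121094") is consistent with perplexity being computed as the exponential of the causal-LM loss, and the "Converted score" pairs (20.66 → 69.72, 5.96 → 82.15, 8.97 → 78.06) are consistent with a 100 − 10·ln(perplexity) rescaling. The sketch below reproduces that pipeline under those two inferred formulas; they are read off the logged numbers, not taken from the Space's source, and the evaluation text is a stand-in since the actual 141-token text is not shown on this page.

# Sketch of the measurement the logs trace: forward pass -> mean token loss ->
# perplexity = exp(loss) -> leaderboard score. The score formula is an inference
# from the logged (perplexity, converted score) pairs, not the Space's own code.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openai-community/gpt2-large"  # model and revision taken from the log
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="main")
model = AutoModelForCausalLM.from_pretrained(model_id, revision="main")  # log reports float16; default dtype kept here for CPU safety
model.eval()

sample_text = "The quick brown fox jumps over the lazy dog."  # stand-in for the Space's evaluation text
inputs = tokenizer(sample_text, return_tensors="pt")

with torch.no_grad():
    # For causal LMs, passing labels=input_ids returns the mean next-token
    # cross-entropy loss over the sequence.
    loss = model(**inputs, labels=inputs["input_ids"]).loss.item()

perplexity = math.exp(loss)
score = 100 - 10 * math.log(perplexity)  # inferred rescaling; algebraically equal to 100 - 10 * loss

# Check against the logged values: exp(2.1944427490234375) ≈ 8.975, and
# 100 - 10 * ln(8.974998474121094) ≈ 78.056, matching "Converted score: 78.05557235640035".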