# Source: agent-leaderboard/components/prediction_components.py
# Last commit: "improved layout" (83e2d7b) by Pratik Bhavsar; file size 24.1 kB
"""Components for AC prediction and visualization"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objects as go
def polyfit_curve_fit(func, xdata, ydata, p0=None, maxfev=5000, bounds=None):
    """Approximate ``scipy.optimize.curve_fit`` using only NumPy.

    The model is recognised by ``func.__name__``; each known model is fitted
    with a closed-form/heuristic estimate instead of an iterative solver.

    Args:
        func: Model function. Recognised names are ``exponential_growth``,
            ``logistic_growth`` and ``power_law``.
        xdata: Sequence of x values (list or array).
        ydata: Sequence of y values (list or array), same length as ``xdata``.
        p0: Initial guess; returned unchanged for unrecognised models.
        maxfev: Accepted for signature compatibility with SciPy; ignored.
        bounds: Accepted for signature compatibility with SciPy; ignored.

    Returns:
        ``(params, None)`` where ``params`` is a list of model parameters.
        No covariance estimate is produced, hence the ``None``.
    """
    # SciPy accepts plain lists, so the fallback must as well.
    xdata = np.asarray(xdata, dtype=float)
    ydata = np.asarray(ydata, dtype=float)
    model_name = getattr(func, '__name__', '')

    if model_name == 'exponential_growth':
        # Linearise: log(y + offset) = log(a) + b*x. The offset avoids log(0).
        offset = 0.01
        slope, intercept = np.polyfit(xdata, np.log(ydata + offset), 1)
        # Fitting y + offset = a*exp(b*x) means y = a*exp(b*x) - offset,
        # so the additive term of the model is -offset (not +offset).
        return [np.exp(intercept), slope, -offset], None

    if model_name == 'logistic_growth':
        # Ceiling L: slightly above the observed maximum, capped at 1.0.
        L = min(1.0, ydata.max() * 1.2)
        # Midpoint x0 defaults to the median x; if the data is still rising
        # and below L/2, project linearly to where L/2 would be reached.
        x0 = np.median(xdata)
        if len(xdata) > 1:
            span = xdata[-1] - xdata[0]
            slope = (ydata[-1] - ydata[0]) / span if span != 0 else 0.0
            midpoint_value = L / 2
            if slope > 0 and ydata[-1] < midpoint_value:
                x0 = xdata[-1] + (midpoint_value - ydata[-1]) / slope
        k = 0.003  # Conservative default growth rate
        return [L, k, x0], None

    if model_name == 'power_law':
        # Linearise: log(y + 0.01) = log(a) + b*log(x + 1). The shifts keep
        # both logarithms finite at zero, so this is only an approximation
        # of y = a * x^b.
        slope, intercept = np.polyfit(np.log(xdata + 1), np.log(ydata + 0.01), 1)
        return [np.exp(intercept), slope], None

    # Unknown model: nothing to estimate, hand back the initial guess.
    return p0, None


try:
    from scipy.optimize import curve_fit
    from scipy import stats
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    # Fallback to numpy polynomial fitting
    curve_fit = polyfit_curve_fit
def exponential_growth(x, a, b, c):
    """Evaluate the exponential model ``a * exp(b * x) + c``.

    ``a`` scales the curve, ``b`` is the rate inside the exponent and ``c``
    is a constant vertical offset.
    """
    growth_term = np.exp(b * x)
    return a * growth_term + c
def logistic_growth(x, L, k, x0):
    """Evaluate the logistic model ``L / (1 + exp(-k * (x - x0)))``.

    ``L`` is the upper asymptote, ``k`` the steepness and ``x0`` the
    x position at which the curve reaches ``L / 2``.
    """
    decay_term = np.exp(-k * (x - x0))
    return L / (1 + decay_term)
def power_law(x, a, b):
    """Evaluate the power-law model ``a * x^b``.

    ``a`` is the multiplicative coefficient and ``b`` the exponent.
    """
    raised = np.power(x, b)
    return a * raised
# Action Completion level treated as "enterprise-grade" throughout the chart.
_TARGET_AC = 0.99

# Emoji prefixes used by the UI dropdown labels, mapped to plain domain names.
_DOMAIN_EMOJI_PREFIXES = (
    ('🌐', "All"),
    ('🏦', "Banking"),
    ('🏥', "Healthcare"),
    ('🛡️', "Insurance"),
    ('💰', "Investment"),
    ('📱', "Telecom"),
)


def _normalize_domain_filter(domain_filter):
    """Map an emoji-prefixed dropdown label to its plain domain name."""
    if not domain_filter:
        return "All"
    for prefix, domain_name in _DOMAIN_EMOJI_PREFIXES:
        if domain_filter.startswith(prefix):
            return domain_name
    return domain_filter


def _parse_release_dates(dates):
    """Convert a 'Release Date' column to datetimes, accepting YYYY-MM and YYYY-MM-DD."""
    if pd.api.types.is_datetime64_any_dtype(dates):
        return dates
    text = dates.astype(str).str.strip()
    # A value with exactly one dash is YYYY-MM: pin it to the first of the
    # month. Decided per row so columns mixing both formats still parse.
    month_only = text.str.count('-') == 1
    return pd.to_datetime(text.where(~month_only, text + '-01'))


def _prepare_model_data(df, domain_filter, model_type_filter):
    """Filter rows, expose the chosen AC column as 'Avg AC', and sort by release date."""
    # Use the domain-specific column when it exists, otherwise the average.
    ac_column = 'Avg AC'
    if domain_filter != "All":
        domain_column = f'{domain_filter} AC'
        if domain_column in df.columns:
            ac_column = domain_column

    # Keep only models with a release date and a positive AC score.
    df_clean = df.dropna(subset=['Release Date', ac_column])
    df_clean = df_clean[df_clean[ac_column] > 0]

    if model_type_filter == "Open Source":
        df_clean = df_clean[df_clean['Model Type'] == 'Open source']
    elif model_type_filter == "Proprietary":
        df_clean = df_clean[df_clean['Model Type'] == 'Proprietary']

    if ac_column != 'Avg AC':
        # Drop the original average first so the rename cannot create a
        # duplicate 'Avg AC' column.
        df_clean = df_clean.drop(columns=['Avg AC'], errors='ignore')
        df_clean = df_clean.rename(columns={ac_column: 'Avg AC'})

    # Copy so the caller's DataFrame is never modified.
    df_clean = df_clean.copy()
    if df_clean.empty:
        return df_clean

    df_clean['Release Date'] = _parse_release_dates(df_clean['Release Date'])
    # A fresh unique index guarantees idxmax/.loc address exactly one row.
    df_clean = df_clean.sort_values('Release Date').reset_index(drop=True)
    # Running maximum: best performance achieved up to each release.
    df_clean['Cumulative_Max_AC'] = df_clean['Avg AC'].cummax()
    return df_clean


def _fit_conservative_trend(x_data, y_data):
    """Return ([slope, intercept], r2) for a halved-slope line through the latest point."""
    if len(x_data) < 2:
        # Single data point - use minimal growth
        return [0.0001, y_data[0]], 0.0

    trend = np.polyfit(x_data, y_data, 1)
    # Assume 50% slower future improvements (diminishing returns).
    conservative_slope = trend[0] * 0.5
    # Anchor the projection at the most recent frontier point.
    coeffs = [conservative_slope, y_data[-1] - conservative_slope * x_data[-1]]

    # R² of the unadjusted linear fit; undefined for a flat frontier, in
    # which case 0.0 is reported instead of dividing by zero.
    residual_ss = np.sum((y_data - np.polyval(trend, x_data)) ** 2)
    total_ss = np.sum((y_data - y_data.mean()) ** 2)
    r2 = float(1 - residual_ss / total_ss) if total_ss > 0 else 0.0
    return coeffs, r2


def _hover_label(border_color):
    """Return the shared dark hover-label style with the given border colour."""
    return dict(
        bgcolor='rgba(26, 26, 46, 0.95)',
        bordercolor=border_color,
        font=dict(size=14, color='#F5F6F7', family='Geist, sans-serif'),
        align='left',
        namelength=-1
    )


def _axis_style(title_text, **extra):
    """Return the axis styling shared by both axes, extended with ``extra`` settings."""
    axis = dict(
        title=dict(
            text=title_text,
            font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
            standoff=20
        ),
        tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
        gridcolor="rgba(245, 246, 247, 0.08)",
        zerolinecolor="rgba(245, 246, 247, 0.15)",
        showgrid=True,
        gridwidth=1,
        showspikes=True,
        spikecolor="rgba(245, 246, 247, 0.3)",
        spikethickness=1,
        spikemode='across',
        spikedash='dot'
    )
    axis.update(extra)
    return axis


def _build_title(domain_filter, model_type_filter, date_99, months_from_now):
    """Compose the HTML chart title, including active filters and the forecast line."""
    filter_parts = []
    if domain_filter != "All":
        filter_parts.append(f"{domain_filter} Domain")
    if model_type_filter != "All":
        filter_parts.append(f"{model_type_filter} Models")

    title_text = "<span style='font-size: 24px;'>🚀 When Will AI Agents Reach Enterprise-Grade Reliability?</span>"
    if filter_parts:
        filter_text = " - " + ", ".join(filter_parts)
        title_text += f"<br><span style='font-size: 14px; color: #1098F7;'>{filter_text}</span>"

    # Explicit None check: a crossing exactly "now" (0.0 months) is still a
    # valid prediction and must not fall through to the "tracking" message.
    if date_99 is not None and months_from_now is not None:
        if months_from_now > 0:
            title_text += f"<br><span style='font-size: 16px; color: #B1B5B9;'>Prediction: <b style='color: #FFD700;'>{date_99.strftime('%B %Y')}</b> (~{months_from_now:.0f} months)</span>"
        else:
            title_text += "<br><span style='font-size: 16px; color: #28a745;'>Already achieved!</span>"
    else:
        title_text += "<br><span style='font-size: 16px; color: #B1B5B9;'>Tracking performance improvements...</span>"
    return title_text


def _add_threshold_line(fig):
    """Draw the horizontal 99% 'enterprise-grade' threshold on ``fig``."""
    fig.add_hline(
        y=_TARGET_AC,
        line_dash="dash",
        line_color="rgba(40, 167, 69, 0.4)",
        line_width=2,
        annotation=dict(
            text="<b>Enterprise-Grade Threshold (99%)</b>",
            font=dict(size=13, color='#28a745', family='Geist, sans-serif'),
            bgcolor='rgba(40, 167, 69, 0.15)',
            bordercolor='#28a745',
            borderwidth=1,
            borderpad=4
        ),
        annotation_position="right"
    )


def _apply_chart_layout(fig, title_text, x_range):
    """Apply the dark theme, axes, legend and footnote; ``x_range=None`` leaves x auto-ranged."""
    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=20, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.5,
            xanchor='center'
        ),
        xaxis=_axis_style(
            "<b>Release Date</b>",
            tickangle=0,
            tickformat='%b %Y',
            range=x_range
        ),
        yaxis=_axis_style(
            "<b>Action Completion (AC)</b>",
            tickformat='.0%',
            dtick=0.1,
            range=[-0.05, 1.08]
        ),
        plot_bgcolor="rgba(1, 9, 26, 0.98)",
        paper_bgcolor="rgba(1, 9, 26, 0.98)",
        height=650,
        margin=dict(l=90, r=100, t=120, b=90),
        hovermode='closest',
        hoverdistance=30,
        spikedistance=50,
        legend=dict(
            bgcolor="rgba(1, 9, 26, 0.9)",
            bordercolor="rgba(245, 246, 247, 0.3)",
            borderwidth=2,
            font=dict(size=12, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.02,
            y=0.98,
            xanchor='left',
            yanchor='top',
            orientation='v',
            itemsizing='constant',
            itemwidth=40,
            tracegroupgap=5,
            title=dict(
                text='<b>Legend</b>',
                font=dict(size=13, color='#F5F6F7')
            )
        ),
        showlegend=True,
        annotations=[
            dict(
                text="<b>Model:</b> Conservative Linear | <b>Note:</b> Limited data - projection assumes diminishing returns",
                xref="paper", yref="paper",
                x=0.01, y=-0.12,
                showarrow=False,
                font=dict(size=11, color="#B1B5B9", family="Geist, sans-serif"),
                bgcolor="rgba(1, 9, 26, 0.9)",
                bordercolor="rgba(245, 246, 247, 0.3)",
                borderwidth=1,
                borderpad=4
            )
        ]
    )


def create_ac_prediction_chart(df, domain_filter="All", model_type_filter="All"):
    """Create a prediction chart showing when AC will reach 99%.

    The best AC achieved up to each release date (the "frontier") is fitted
    with a straight line whose slope is then halved, and that conservative
    line is projected 15 years ahead in 30-day steps.

    Args:
        df: DataFrame with model data. Reads the 'Release Date', 'Model' and
            'Avg AC' columns; optionally '<Domain> AC', 'Model Type',
            'Vendor' and 'Avg Total Cost'. It is not modified.
        domain_filter: Domain to filter by (All, Banking, Healthcare, etc.).
            Emoji-prefixed dropdown labels are accepted.
        model_type_filter: Model type to filter by (All, Open Source, Proprietary)

    Returns:
        Tuple ``(fig, date_99, months_from_now, current_best_ac)``:
        the plotly figure, the projected date of reaching 99% (or None if
        not reached within the projection window), that date's distance from
        today in months (or None), and the latest frontier AC value. When no
        rows survive filtering, the figure has no data traces and the other
        three values are None.
    """
    domain_filter = _normalize_domain_filter(domain_filter)
    model_type_filter = model_type_filter or "All"
    df_clean = _prepare_model_data(df, domain_filter, model_type_filter)

    fig = go.Figure()

    # Nothing to plot: return an empty but fully styled chart instead of
    # failing on the first positional lookup.
    if df_clean.empty:
        _add_threshold_line(fig)
        _apply_chart_layout(
            fig,
            _build_title(domain_filter, model_type_filter, None, None),
            x_range=None
        )
        return fig, None, None, None

    now = datetime.now()

    # Frontier: one row per release date holding the best AC seen so far.
    df_best = df_clean.groupby('Release Date')['Cumulative_Max_AC'].max().reset_index()
    df_best.columns = ['Release Date', 'Avg AC']
    df_best['Avg AC'] = df_best['Avg AC'].cummax()

    # Convert dates to days since first release for curve fitting
    first_date = df_best['Release Date'].min()
    df_best['Days'] = (df_best['Release Date'] - first_date).dt.days
    x_data = df_best['Days'].values
    y_data = df_best['Avg AC'].values

    # With limited data, use a simple conservative linear projection rather
    # than complex curves that would overfit.
    best_model = 'linear'
    best_fit, best_r2 = _fit_conservative_trend(x_data, y_data)

    # Prediction timeline: 15 years in 30-day intervals, capped at 100%.
    future_days = np.arange(0, 5475, 30)
    future_ac = np.minimum(np.polyval(best_fit, future_days), 1.0)
    future_dates = [first_date + timedelta(days=int(d)) for d in future_days]

    # First projected point at or above the target, if any.
    crossing_idx = np.where(future_ac >= _TARGET_AC)[0]
    if len(crossing_idx) > 0:
        date_99 = first_date + timedelta(days=int(future_days[crossing_idx[0]]))
        months_from_now = (date_99 - now).days / 30.4
    else:
        date_99 = None
        months_from_now = None

    # Confidence band goes first so it renders behind the other traces.
    # Uncertainty widens linearly from 1x to 2x the base value.
    future_std = 0.05
    band_halfwidth = future_std * np.linspace(1.0, 2.0, len(future_dates))
    upper_bound = np.minimum(future_ac + band_halfwidth, 1.0)
    lower_bound = np.maximum(future_ac - band_halfwidth, 0)
    fig.add_trace(go.Scatter(
        x=future_dates + future_dates[::-1],
        y=list(upper_bound) + list(lower_bound[::-1]),
        fill='toself',
        fillcolor='rgba(16, 152, 247, 0.05)',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
        name='Uncertainty'
    ))

    # Per-model hover metadata, taken from each row's own values.
    n_models = len(df_clean)
    if 'Vendor' in df_clean.columns:
        vendor_info = df_clean['Vendor'].fillna('Unknown').values
    else:
        vendor_info = ['Unknown'] * n_models
    if 'Model Type' in df_clean.columns:
        model_type = df_clean['Model Type'].fillna('Unknown').values
    else:
        model_type = ['Unknown'] * n_models
    if 'Avg Total Cost' in df_clean.columns:
        cost_info = df_clean['Avg Total Cost'].fillna(0).values
    else:
        cost_info = [0] * n_models

    gap_to_99 = (_TARGET_AC - df_clean['Avg AC']).values
    gap_to_best = (df_clean['Cumulative_Max_AC'] - df_clean['Avg AC']).values
    # A model is on the frontier when it matches the running best
    # (small tolerance for float comparison).
    is_frontier = df_clean['Avg AC'] >= df_clean['Cumulative_Max_AC'] - 0.001
    frontier_status = [
        '✅ Advanced SOTA' if on_frontier else '❌ Below existing best'
        for on_frontier in is_frontier
    ]

    # Add historical data points with comprehensive hover
    fig.add_trace(go.Scatter(
        x=df_clean['Release Date'],
        y=df_clean['Avg AC'],
        mode='markers',
        name='Individual Models',
        marker=dict(
            size=14,
            color='rgba(227, 84, 84, 0.8)',
            line=dict(width=2, color='rgba(255, 255, 255, 0.6)'),
            symbol='circle'
        ),
        customdata=list(zip(vendor_info, model_type, gap_to_99, gap_to_best, cost_info, frontier_status)),
        hovertemplate=(
            '<b style="font-size: 18px; color: #E35454;">%{text}</b><br>'
            '<br>'
            '<b style="color: #1098F7;">Model Information:</b><br>'
            '• <b>Vendor:</b> %{customdata[0]}<br>'
            '• <b>Type:</b> %{customdata[1]}<br>'
            '• <b>Released:</b> %{x|%B %Y}<br>'
            '• <b>Frontier Status:</b> %{customdata[5]}<br>'
            '<br>'
            '<b style="color: #FFD700;">Performance Metrics:</b><br>'
            '• <b>Action Completion:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[2]:.1%}</span><br>'
            '• <b>Behind Best:</b> <span style="color: #FFA500;">-%{customdata[3]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Cost Efficiency:</b><br>'
            '• <b>Avg Session Cost:</b> $%{customdata[4]:.4f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 13px;">Performance at release time</i>'
            '<extra></extra>'
        ),
        text=df_clean['Model'].values,
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Improvement metrics for the frontier hover.
    df_best['Improvement'] = df_best['Avg AC'].diff().fillna(0)
    df_best['Improvement_Pct'] = (df_best['Avg AC'].pct_change() * 100).fillna(0)
    df_best['Gap_to_99'] = _TARGET_AC - df_best['Avg AC']

    # For each frontier date, the model holding the best AC released so far.
    # Every frontier date comes from df_clean, so the slice is never empty.
    best_model_at_date = []
    for date_val in df_best['Release Date']:
        released = df_clean[df_clean['Release Date'] <= date_val]
        best_model_at_date.append(released.loc[released['Avg AC'].idxmax(), 'Model'])

    # Add best performance line with enhanced metrics
    fig.add_trace(go.Scatter(
        x=df_best['Release Date'],
        y=df_best['Avg AC'],
        mode='lines+markers',
        name='Best Performance Trend',
        line=dict(color='#E35454', width=4, shape='linear'),
        marker=dict(
            size=16,
            color='#E35454',
            symbol='diamond',
            line=dict(width=2, color='white')
        ),
        customdata=list(zip(
            df_best['Improvement'].values,
            df_best['Improvement_Pct'].values,
            df_best['Gap_to_99'].values,
            best_model_at_date
        )),
        hovertemplate=(
            '<b style="font-size: 20px; color: #E35454;">📈 Best Performance Frontier</b><br>'
            '<br>'
            '<b>Date:</b> %{x|%B %Y}<br>'
            '<b>Leading Model:</b> %{customdata[3]}<br>'
            '<b>Cumulative Best AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '<br>'
            '<b>Progress Metrics:</b><br>'
            '• Improvement: <span style="color: #28a745;">+%{customdata[0]:.1%}</span><br>'
            '• Growth Rate: <span style="color: #28a745;">+%{customdata[1]:.1f}%</span><br>'
            '• Gap to 99%: <span style="color: #1098F7;">%{customdata[2]:.1%}</span><br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 13px;">This represents the best performance achieved by any model up to this date</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Distance from today for each prediction point.
    months_from_now_list = [(date - now).days / 30.4 for date in future_dates]
    years_from_now_list = [m / 12 for m in months_from_now_list]

    # Add prediction line with comprehensive metrics
    fig.add_trace(go.Scatter(
        x=future_dates,
        y=future_ac,
        mode='lines',
        name=f'Prediction ({best_model.capitalize()})',
        line=dict(color='#1098F7', width=4, dash='dash'),
        opacity=0.8,
        customdata=list(zip(
            np.maximum(0, _TARGET_AC - future_ac),
            months_from_now_list,
            years_from_now_list,
            [best_r2] * len(future_ac)
        )),
        hovertemplate=(
            '<b style="font-size: 18px; color: #1098F7;">🔮 AI Performance Prediction</b><br>'
            '<br>'
            '<b style="color: #FFD700;">Forecast Details:</b><br>'
            '• <b>Date:</b> %{x|%B %Y}<br>'
            '• <b>Predicted AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[0]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Timeline:</b><br>'
            '• <b>Months from now:</b> %{customdata[1]:.0f} months<br>'
            '• <b>Years from now:</b> %{customdata[2]:.1f} years<br>'
            '<br>'
            '<b style="color: #1098F7;">Model Confidence:</b><br>'
            f'• <b>Algorithm:</b> {best_model.capitalize()}<br>'
            '• <b>R² Score:</b> %{customdata[3]:.3f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 13px;">Based on historical performance trends</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(16, 152, 247, 0.5)')
    ))

    # Add 99% threshold line with enhanced styling
    _add_threshold_line(fig)

    # Add marker for 99% crossing point with enhanced visibility
    if date_99 is not None:
        days_until = (date_99 - now).days
        fig.add_trace(go.Scatter(
            x=[date_99],
            y=[_TARGET_AC],
            mode='markers+text',
            name='🎯 99% Achievement',
            marker=dict(
                size=28,
                color='#28a745',
                symbol='star',
                line=dict(width=3, color='white')
            ),
            text=[f'<b>{date_99.strftime("%b %Y")}</b>'],
            textposition='top center',
            textfont=dict(size=16, color='#28a745', family='Geist, sans-serif'),
            hovertemplate=(
                '<b style="font-size: 18px; color: #28a745;">🎯 ENTERPRISE-READY MILESTONE</b><br>'
                '<br>'
                f'<b>Achievement Date:</b> <span style="font-size: 16px;">{date_99.strftime("%B %Y")}</span><br>'
                f'<b>Time from today:</b> <span style="font-size: 16px; color: #FFD700;">{months_from_now:.0f} months</span><br>'
                f'<b>Days remaining:</b> {days_until} days<br>'
                f'<b>Years:</b> {months_from_now/12:.1f} years<br>'
                '<br>'
                '<b style="color: #1098F7;">Strategic Implications:</b><br>'
                f'• Early adopters gain {months_from_now:.0f}-month advantage<br>'
                '• Infrastructure investment critical now<br>'
                '• 99% reliability enables production deployment<br>'
                '<extra></extra>'
            ),
            hoverlabel=_hover_label('rgba(40, 167, 69, 0.5)')
        ))

    # Show from 60 days before the first release to ~800 days ahead of today,
    # never beyond the end of the projection.
    x_range = [
        df_clean['Release Date'].min() - timedelta(days=60),
        min(now + timedelta(days=800), future_dates[-1])
    ]
    _apply_chart_layout(
        fig,
        _build_title(domain_filter, model_type_filter, date_99, months_from_now),
        x_range
    )

    # Current best AC is the last (highest) frontier value.
    current_best_ac = y_data[-1]
    return fig, date_99, months_from_now, current_best_ac