Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

agent-leaderboard / components /prediction_components.py

Pratik Bhavsar

improved layout

83e2d7b 22 days ago

24.1 kB

	"""Components for AC prediction and visualization"""
	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta
	import plotly.graph_objects as go

	try:
	    from scipy.optimize import curve_fit
	    from scipy import stats
	    HAS_SCIPY = True
	except ImportError:
	    HAS_SCIPY = False

	    # NOTE: `stats` is not bound on this path; guard any use of it with HAS_SCIPY.
	    def curve_fit(func, xdata, ydata, p0=None, maxfev=5000, bounds=None):
	        """NumPy-only stand-in for ``scipy.optimize.curve_fit``.

	        The model is selected by ``func.__name__``; each known model gets a
	        closed-form or heuristic estimate instead of an iterative fit.

	        Args:
	            func: Model function; recognised names are 'exponential_growth',
	                'logistic_growth' and 'power_law'.
	            xdata: Independent values (any array-like).
	            ydata: Dependent values (any array-like, same length as xdata).
	            p0: Initial guess; returned unchanged for unrecognised models.
	            maxfev: Accepted for signature compatibility, ignored.
	            bounds: Accepted for signature compatibility, ignored.

	        Returns:
	            ``(params, None)`` -- a parameter list in the order ``func``
	            expects, and ``None`` in place of the covariance matrix.
	        """
	        # Accept plain lists as well as arrays, like SciPy does.
	        x = np.asarray(xdata, dtype=float)
	        y = np.asarray(ydata, dtype=float)
	        model_name = getattr(func, '__name__', '')

	        if model_name == 'exponential_growth':
	            # Linearise: log(y + offset) = log(a) + b*x. The offset keeps
	            # log() finite at y == 0, so the fitted curve is
	            # y = a*exp(b*x) - offset, i.e. c must be -offset.
	            offset = 0.01
	            rate, log_scale = np.polyfit(x, np.log(y + offset), 1)
	            return [np.exp(log_scale), rate, -offset], None

	        if model_name == 'logistic_growth':
	            # Ceiling: slightly above the observed maximum, capped at 1.0.
	            L = min(1.0, float(np.max(y)) * 1.2)

	            # Midpoint: default to the median x; if the data is still rising
	            # and below L/2, extrapolate linearly to where L/2 would be hit.
	            x0 = np.median(x)
	            span = x[-1] - x[0] if len(x) > 1 else 0.0
	            if span != 0:  # avoid dividing by a zero-width x range
	                slope = (y[-1] - y[0]) / span
	                midpoint_value = L / 2
	                if slope > 0 and y[-1] < midpoint_value:
	                    x0 = x[-1] + (midpoint_value - y[-1]) / slope

	            k = 0.003  # Conservative default growth rate
	            return [L, k, x0], None

	        if model_name == 'power_law':
	            # Linearise: log(y + 0.01) = log(a) + b*log(x + 1).
	            # NOTE: power_law itself evaluates a * x**b without these
	            # shifts, so the result is only an approximation.
	            exponent, log_scale = np.polyfit(np.log(x + 1), np.log(y + 0.01), 1)
	            return [np.exp(log_scale), exponent], None

	        # Unknown model: hand back the caller's initial guess.
	        return p0, None

	def exponential_growth(x, a, b, c):
	    """Evaluate the exponential model ``a * exp(b * x) + c`` at ``x``."""
	    growth = np.exp(b * x)
	    return a * growth + c

	def logistic_growth(x, L, k, x0):
	    """Evaluate the logistic curve ``L / (1 + exp(-k * (x - x0)))`` at ``x``.

	    ``L`` is the ceiling, ``k`` the steepness and ``x0`` the midpoint.
	    """
	    decay = np.exp(-k * (x - x0))
	    return L / (1 + decay)

	def power_law(x, a, b):
	    """Evaluate the power-law model ``a * x ** b`` at ``x``."""
	    raised = np.power(x, b)
	    return a * raised

	# Emoji prefixes used by the domain dropdown, mapped to plain domain names.
	_DOMAIN_EMOJI_PREFIXES = (
	    ('🌐', 'All'),
	    ('🏦', 'Banking'),
	    ('🏥', 'Healthcare'),
	    ('🛡️', 'Insurance'),
	    ('💰', 'Investment'),
	    ('📱', 'Telecom'),
	)


	def _normalize_domain_filter(domain_filter):
	    """Map an emoji-prefixed domain label to its plain name; pass others through."""
	    for emoji, domain in _DOMAIN_EMOJI_PREFIXES:
	        if domain_filter.startswith(emoji):
	            return domain
	    return domain_filter


	def _parse_release_dates(dates):
	    """Convert a release-date column to datetimes, accepting YYYY-MM and YYYY-MM-DD."""
	    if pd.api.types.is_datetime64_any_dtype(dates):
	        return dates
	    text = dates.astype(str)
	    # A single '-' means there is no day component; pin those to the 1st.
	    month_only = text.str.count('-') == 1
	    return pd.to_datetime(text.where(~month_only, text + '-01'))


	def _fit_conservative_trend(x_data, y_data):
	    """Return ([slope, intercept], r2) for a half-slope line through the last point."""
	    if len(x_data) <= 1:
	        # Single data point - use minimal growth
	        return [0.0001, y_data[0]], 0.0

	    z = np.polyfit(x_data, y_data, 1)
	    # Assume future improvements arrive 50% slower (diminishing returns).
	    conservative_slope = z[0] * 0.5
	    best_fit = [conservative_slope, y_data[-1] - conservative_slope * x_data[-1]]

	    # R² of the plain (unadjusted) linear fit.
	    ss_res = np.sum((y_data - np.polyval(z, x_data)) ** 2)
	    ss_tot = np.sum((y_data - y_data.mean()) ** 2)
	    best_r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0.0  # flat frontier -> 0
	    return best_fit, best_r2


	def _lookup_by_model(df, models, column, default):
	    """Look up df[column] for each model name, using default where unavailable."""
	    if column not in df.columns:
	        return pd.Series(default, index=models.index)
	    return models.map(df.set_index('Model')[column].to_dict()).fillna(default)


	def _hover_label(border_color):
	    """Return the shared dark hover-label style with the given border colour."""
	    return dict(
	        bgcolor='rgba(26, 26, 46, 0.95)',
	        bordercolor=border_color,
	        font=dict(size=14, color='#F5F6F7', family='Geist, sans-serif'),
	        align='left',
	        namelength=-1
	    )


	def create_ac_prediction_chart(df, domain_filter="All", model_type_filter="All"):
	    """Create a prediction chart showing when AC will reach 99%.

	    The best AC seen up to each release date forms a frontier; a linear trend
	    is fitted to it, its slope is halved, and the line is projected 15 years
	    ahead in 30-day steps to find the first point at or above 0.99.

	    Args:
	        df: DataFrame with model data. Reads 'Model', 'Release Date',
	            'Avg AC' (or '<Domain> AC'), 'Model Type' and, when present,
	            'Vendor' and 'Avg Total Cost'.
	        domain_filter: Domain to filter by (All, Banking, Healthcare, etc.);
	            an emoji-prefixed label is also accepted.
	        model_type_filter: Model type to filter by (All, Open Source, Proprietary)

	    Returns:
	        Tuple ``(fig, date_99, months_from_now, current_best_ac)``. ``date_99``
	        and ``months_from_now`` are None when the projection never reaches
	        99%; all three trailing values are None when no rows survive filtering.
	    """
	    domain_filter = _normalize_domain_filter(domain_filter)

	    # Use the domain-specific AC column when it exists, else the average.
	    ac_column = 'Avg AC'
	    if domain_filter != "All" and f'{domain_filter} AC' in df.columns:
	        ac_column = f'{domain_filter} AC'

	    # Keep only models with a release date and a positive AC score.
	    df_clean = df.dropna(subset=['Release Date', ac_column])
	    df_clean = df_clean[df_clean[ac_column] > 0]

	    if model_type_filter == "Open Source":
	        df_clean = df_clean[df_clean['Model Type'] == 'Open source']
	    elif model_type_filter == "Proprietary":
	        df_clean = df_clean[df_clean['Model Type'] == 'Proprietary']

	    if df_clean.empty:
	        # Nothing to plot or fit: return a placeholder instead of crashing.
	        fig = go.Figure()
	        fig.update_layout(
	            title=dict(
	                text="No models match the selected filters",
	                font=dict(size=20, family="Geist, sans-serif", color="#F5F6F7"),
	                x=0.5,
	                xanchor='center'
	            ),
	            plot_bgcolor="rgba(1, 9, 26, 0.98)",
	            paper_bgcolor="rgba(1, 9, 26, 0.98)",
	            height=650
	        )
	        return fig, None, None, None

	    # Expose the chosen column as 'Avg AC' so the rest is column-agnostic.
	    if ac_column != 'Avg AC':
	        df_clean = df_clean.drop(columns=['Avg AC'], errors='ignore')
	        df_clean = df_clean.rename(columns={ac_column: 'Avg AC'})
	    df_clean = df_clean.copy()

	    df_clean['Release Date'] = _parse_release_dates(df_clean['Release Date'])
	    # Stable sort keeps input order for same-day releases; a fresh index
	    # guarantees unique labels for the idxmax lookups below.
	    df_clean = df_clean.sort_values('Release Date', kind='stable').reset_index(drop=True)

	    # Running maximum: best performance achieved up to each release.
	    df_clean['Cumulative_Max_AC'] = df_clean['Avg AC'].cummax()

	    # One frontier point per date.
	    df_best = df_clean.groupby('Release Date')['Cumulative_Max_AC'].max().reset_index()
	    df_best.columns = ['Release Date', 'Avg AC']

	    # Days since the first release are the x values for fitting.
	    first_date = df_best['Release Date'].min()
	    df_best['Days'] = (df_best['Release Date'] - first_date).dt.days
	    x_data = df_best['Days'].values
	    y_data = df_best['Avg AC'].values

	    # With limited data, use a simple conservative linear projection
	    # rather than complex curves that would overfit.
	    best_model = 'linear'
	    best_fit, best_r2 = _fit_conservative_trend(x_data, y_data)

	    # Projection timeline: 15 years in 30-day intervals, capped at 1.0.
	    future_days = np.arange(0, 5475, 30)
	    future_ac = np.minimum(np.polyval(best_fit, future_days), 1.0)
	    future_dates = [first_date + timedelta(days=int(d)) for d in future_days]

	    # One timestamp so every "from now" figure agrees.
	    now = datetime.now()

	    # First projected point at or above 99%.
	    crossing_idx = np.flatnonzero(future_ac >= 0.99)
	    if crossing_idx.size:
	        date_99 = future_dates[crossing_idx[0]]
	        months_from_now = (date_99 - now).days / 30.4
	    else:
	        date_99 = None
	        months_from_now = None

	    fig = go.Figure()

	    # Confidence band goes first so it renders behind the other traces.
	    # Uncertainty widens linearly from 1x to 2x the base value.
	    future_std = 0.05
	    band = future_std * np.linspace(1.0, 2.0, len(future_dates))
	    upper_bound = np.minimum(future_ac + band, 1.0)
	    lower_bound = np.maximum(future_ac - band, 0)
	    fig.add_trace(go.Scatter(
	        x=future_dates + future_dates[::-1],
	        y=list(upper_bound) + list(lower_bound[::-1]),
	        fill='toself',
	        fillcolor='rgba(16, 152, 247, 0.05)',
	        line=dict(width=0),
	        showlegend=False,
	        hoverinfo='skip',
	        name='Uncertainty'
	    ))

	    # Per-model hover data.
	    models = df_clean['Model']
	    vendor_info = _lookup_by_model(df, models, 'Vendor', 'Unknown').values
	    model_type = _lookup_by_model(df, models, 'Model Type', 'Unknown').values
	    cost_info = _lookup_by_model(df, models, 'Avg Total Cost', 0).values
	    gap_to_99 = (0.99 - df_clean['Avg AC']).values
	    gap_to_best = (df_clean['Cumulative_Max_AC'] - df_clean['Avg AC']).values

	    # A model is on the frontier when it matches the running best
	    # (small tolerance for float comparison).
	    is_frontier = df_clean['Avg AC'] >= df_clean['Cumulative_Max_AC'] - 0.001
	    frontier_status = [
	        '✅ Advanced SOTA' if on_frontier else '❌ Below existing best'
	        for on_frontier in is_frontier
	    ]

	    # Historical data points.
	    fig.add_trace(go.Scatter(
	        x=df_clean['Release Date'],
	        y=df_clean['Avg AC'],
	        mode='markers',
	        name='Individual Models',
	        marker=dict(
	            size=14,
	            color='rgba(227, 84, 84, 0.8)',
	            line=dict(width=2, color='rgba(255, 255, 255, 0.6)'),
	            symbol='circle'
	        ),
	        customdata=list(zip(vendor_info, model_type, gap_to_99, gap_to_best, cost_info, frontier_status)),
	        hovertemplate=(
	            '<b style="font-size: 18px; color: #E35454;">%{text}</b><br>'
	            '<br>'
	            '<b style="color: #1098F7;">Model Information:</b><br>'
	            '• <b>Vendor:</b> %{customdata[0]}<br>'
	            '• <b>Type:</b> %{customdata[1]}<br>'
	            '• <b>Released:</b> %{x|%B %Y}<br>'
	            '• <b>Frontier Status:</b> %{customdata[5]}<br>'
	            '<br>'
	            '<b style="color: #FFD700;">Performance Metrics:</b><br>'
	            '• <b>Action Completion:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
	            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[2]:.1%}</span><br>'
	            '• <b>Behind Best:</b> <span style="color: #FFA500;">-%{customdata[3]:.1%}</span><br>'
	            '<br>'
	            '<b style="color: #28a745;">Cost Efficiency:</b><br>'
	            '• <b>Avg Session Cost:</b> $%{customdata[4]:.4f}<br>'
	            '<br>'
	            '<i style="color: #B1B5B9; font-size: 13px;">Performance at release time</i>'
	            '<extra></extra>'
	        ),
	        text=models.values,
	        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
	    ))

	    # Frontier hover metrics.
	    improvement = df_best['Avg AC'].diff().fillna(0)
	    improvement_pct = (df_best['Avg AC'].pct_change() * 100).fillna(0)
	    frontier_gap = 0.99 - df_best['Avg AC']

	    # Leading model at each frontier date: highest AC among models released
	    # up to and including that date.
	    best_model_at_date = []
	    for date_val in df_best['Release Date']:
	        released = df_clean[df_clean['Release Date'] <= date_val]
	        best_model_at_date.append(released.loc[released['Avg AC'].idxmax(), 'Model'])

	    # Best-performance frontier line.
	    fig.add_trace(go.Scatter(
	        x=df_best['Release Date'],
	        y=df_best['Avg AC'],
	        mode='lines+markers',
	        name='Best Performance Trend',
	        line=dict(color='#E35454', width=4, shape='linear'),
	        marker=dict(
	            size=16,
	            color='#E35454',
	            symbol='diamond',
	            line=dict(width=2, color='white')
	        ),
	        customdata=list(zip(
	            improvement.values,
	            improvement_pct.values,
	            frontier_gap.values,
	            best_model_at_date
	        )),
	        hovertemplate=(
	            '<b style="font-size: 20px; color: #E35454;">📈 Best Performance Frontier</b><br>'
	            '<br>'
	            '<b>Date:</b> %{x|%B %Y}<br>'
	            '<b>Leading Model:</b> %{customdata[3]}<br>'
	            '<b>Cumulative Best AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
	            '<br>'
	            '<b>Progress Metrics:</b><br>'
	            '• Improvement: <span style="color: #28a745;">+%{customdata[0]:.1%}</span><br>'
	            '• Growth Rate: <span style="color: #28a745;">+%{customdata[1]:.1f}%</span><br>'
	            '• Gap to 99%: <span style="color: #1098F7;">%{customdata[2]:.1%}</span><br>'
	            '<br>'
	            '<i style="color: #B1B5B9; font-size: 13px;">This represents the best performance achieved by any model up to this date</i>'
	            '<extra></extra>'
	        ),
	        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
	    ))

	    # Time from now for each projected point.
	    months_from_now_list = [(date - now).days / 30.4 for date in future_dates]
	    years_from_now_list = [m / 12 for m in months_from_now_list]

	    # Prediction line.
	    fig.add_trace(go.Scatter(
	        x=future_dates,
	        y=future_ac,
	        mode='lines',
	        name=f'Prediction ({best_model.capitalize()})',
	        line=dict(color='#1098F7', width=4, dash='dash'),
	        opacity=0.8,
	        customdata=list(zip(
	            np.maximum(0.99 - future_ac, 0),
	            months_from_now_list,
	            years_from_now_list,
	            [best_r2] * len(future_ac)
	        )),
	        hovertemplate=(
	            '<b style="font-size: 18px; color: #1098F7;">🔮 AI Performance Prediction</b><br>'
	            '<br>'
	            '<b style="color: #FFD700;">Forecast Details:</b><br>'
	            '• <b>Date:</b> %{x|%B %Y}<br>'
	            '• <b>Predicted AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
	            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[0]:.1%}</span><br>'
	            '<br>'
	            '<b style="color: #28a745;">Timeline:</b><br>'
	            '• <b>Months from now:</b> %{customdata[1]:.0f} months<br>'
	            '• <b>Years from now:</b> %{customdata[2]:.1f} years<br>'
	            '<br>'
	            '<b style="color: #1098F7;">Model Confidence:</b><br>'
	            f'• <b>Algorithm:</b> {best_model.capitalize()}<br>'
	            '• <b>R² Score:</b> %{customdata[3]:.3f}<br>'
	            '<br>'
	            '<i style="color: #B1B5B9; font-size: 13px;">Based on historical performance trends</i>'
	            '<extra></extra>'
	        ),
	        hoverlabel=_hover_label('rgba(16, 152, 247, 0.5)')
	    ))

	    # 99% threshold line.
	    fig.add_hline(
	        y=0.99,
	        line_dash="dash",
	        line_color="rgba(40, 167, 69, 0.4)",
	        line_width=2,
	        annotation=dict(
	            text="<b>Enterprise-Grade Threshold (99%)</b>",
	            font=dict(size=13, color='#28a745', family='Geist, sans-serif'),
	            bgcolor='rgba(40, 167, 69, 0.15)',
	            bordercolor='#28a745',
	            borderwidth=1,
	            borderpad=4
	        ),
	        annotation_position="right"
	    )

	    # Star marker at the projected 99% crossing.
	    if date_99 is not None:
	        days_until = (date_99 - now).days

	        fig.add_trace(go.Scatter(
	            x=[date_99],
	            y=[0.99],
	            mode='markers+text',
	            name='🎯 99% Achievement',
	            marker=dict(
	                size=28,
	                color='#28a745',
	                symbol='star',
	                line=dict(width=3, color='white')
	            ),
	            text=[f'<b>{date_99.strftime("%b %Y")}</b>'],
	            textposition='top center',
	            textfont=dict(size=16, color='#28a745', family='Geist, sans-serif'),
	            hovertemplate=(
	                '<b style="font-size: 18px; color: #28a745;">🎯 ENTERPRISE-READY MILESTONE</b><br>'
	                '<br>'
	                f'<b>Achievement Date:</b> <span style="font-size: 16px;">{date_99.strftime("%B %Y")}</span><br>'
	                f'<b>Time from today:</b> <span style="font-size: 16px; color: #FFD700;">{months_from_now:.0f} months</span><br>'
	                f'<b>Days remaining:</b> {days_until} days<br>'
	                f'<b>Years:</b> {months_from_now/12:.1f} years<br>'
	                '<br>'
	                '<b style="color: #1098F7;">Strategic Implications:</b><br>'
	                f'• Early adopters gain {months_from_now:.0f}-month advantage<br>'
	                '• Infrastructure investment critical now<br>'
	                '• 99% reliability enables production deployment<br>'
	                '<extra></extra>'
	            ),
	            hoverlabel=_hover_label('rgba(40, 167, 69, 0.5)')
	        ))

	    # Title, with a subtitle describing the active filters.
	    filter_parts = []
	    if domain_filter != "All":
	        filter_parts.append(f"{domain_filter} Domain")
	    if model_type_filter != "All":
	        filter_parts.append(f"{model_type_filter} Models")
	    filter_text = f" - {', '.join(filter_parts)}" if filter_parts else ""

	    title_text = "<span style='font-size: 24px;'>🚀 When Will AI Agents Reach Enterprise-Grade Reliability?</span>"
	    if filter_text:
	        title_text += f"<br><span style='font-size: 14px; color: #1098F7;'>{filter_text}</span>"

	    # Compare against None so a crossing exactly "now" (0 months) still
	    # counts as a prediction rather than "no prediction".
	    if date_99 is not None and months_from_now is not None:
	        if months_from_now > 0:
	            title_text += f"<br><span style='font-size: 16px; color: #B1B5B9;'>Prediction: <b style='color: #FFD700;'>{date_99.strftime('%B %Y')}</b> (~{months_from_now:.0f} months)</span>"
	        else:
	            title_text += "<br><span style='font-size: 16px; color: #28a745;'>Already achieved!</span>"
	    else:
	        title_text += "<br><span style='font-size: 16px; color: #B1B5B9;'>Tracking performance improvements...</span>"

	    fig.update_layout(
	        title=dict(
	            text=title_text,
	            font=dict(size=20, family="Geist, sans-serif", color="#F5F6F7"),
	            x=0.5,
	            xanchor='center'
	        ),
	        xaxis=dict(
	            title=dict(
	                text="<b>Release Date</b>",
	                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
	                standoff=20
	            ),
	            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
	            gridcolor="rgba(245, 246, 247, 0.08)",
	            zerolinecolor="rgba(245, 246, 247, 0.15)",
	            showgrid=True,
	            gridwidth=1,
	            tickangle=0,
	            tickformat='%b %Y',
	            showspikes=True,
	            spikecolor="rgba(245, 246, 247, 0.3)",
	            spikethickness=1,
	            spikemode='across',
	            spikedash='dot',
	            # Show from 60 days before the first release to ~800 days ahead
	            # (or the end of the projection, whichever comes first).
	            range=[df_clean['Release Date'].min() - timedelta(days=60),
	                   min(now + timedelta(days=800), future_dates[-1])]
	        ),
	        yaxis=dict(
	            title=dict(
	                text="<b>Action Completion (AC)</b>",
	                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
	                standoff=20
	            ),
	            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
	            gridcolor="rgba(245, 246, 247, 0.08)",
	            zerolinecolor="rgba(245, 246, 247, 0.15)",
	            showgrid=True,
	            gridwidth=1,
	            tickformat='.0%',
	            dtick=0.1,
	            showspikes=True,
	            spikecolor="rgba(245, 246, 247, 0.3)",
	            spikethickness=1,
	            spikemode='across',
	            spikedash='dot',
	            range=[-0.05, 1.08]
	        ),
	        plot_bgcolor="rgba(1, 9, 26, 0.98)",
	        paper_bgcolor="rgba(1, 9, 26, 0.98)",
	        height=650,
	        margin=dict(l=90, r=100, t=120, b=90),
	        hovermode='closest',
	        hoverdistance=30,
	        spikedistance=50,
	        legend=dict(
	            bgcolor="rgba(1, 9, 26, 0.9)",
	            bordercolor="rgba(245, 246, 247, 0.3)",
	            borderwidth=2,
	            font=dict(size=12, family="Geist, sans-serif", color="#F5F6F7"),
	            x=0.02,
	            y=0.98,
	            xanchor='left',
	            yanchor='top',
	            orientation='v',
	            itemsizing='constant',
	            itemwidth=40,
	            tracegroupgap=5,
	            title=dict(
	                text='<b>Legend</b>',
	                font=dict(size=13, color='#F5F6F7')
	            )
	        ),
	        showlegend=True,
	        annotations=[
	            dict(
	                text="<b>Model:</b> Conservative Linear | <b>Note:</b> Limited data - projection assumes diminishing returns",
	                xref="paper", yref="paper",
	                x=0.01, y=-0.12,
	                showarrow=False,
	                font=dict(size=11, color="#B1B5B9", family="Geist, sans-serif"),
	                bgcolor="rgba(1, 9, 26, 0.9)",
	                bordercolor="rgba(245, 246, 247, 0.3)",
	                borderwidth=1,
	                borderpad=4
	            )
	        ]
	    )

	    # Current best AC is the last (highest) frontier value.
	    current_best_ac = y_data[-1]

	    return fig, date_99, months_from_now, current_best_ac