"""Gradio app: credit-card fraud scoring with a calibrated Random Forest.

Downloads a calibrated RF model and threshold metadata from the Hugging Face
Hub, derives the engineered features used at training time from raw
``Time``/``Amount`` columns when they are absent, and scores an uploaded CSV
of transactions against a user-adjustable decision threshold.
"""

import json
import tempfile

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download

# === Config ===
REPO_ID = "TarekMasryo/CreditCard-fraud-detection-ML"  # your model repo
MODEL_FILENAME = "model_rf_cal.joblib"  # using RF-Cal by default
META_FILENAME = "meta.json"

# Engineered features expected by the model (must match training pipeline)
PCA_FEATURES = [f"V{i}" for i in range(1, 29)]
ENGINEERED = [
    "Amount",
    "_log_amount",
    "Hour_from_start_mod24",
    "is_night_proxy",
    "is_business_hours_proxy",
]
FEATURES = PCA_FEATURES + ENGINEERED

# === Load model & thresholds (runs once, at startup) ===
model_path = hf_hub_download(REPO_ID, MODEL_FILENAME)
model = joblib.load(model_path)

meta_path = hf_hub_download(REPO_ID, META_FILENAME)
with open(meta_path, "r") as f:
    meta = json.load(f)

# Default threshold (Validation P>=90%) for RF-Cal
DEFAULT_THR = float(meta["thresholds"]["rf_cal"]["p90"])


def ensure_engineered_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Create engineered columns if missing, using the same logic as training.

    Derives, when absent and the raw inputs exist:
      - ``_log_amount``: log1p of ``Amount``.
      - ``Hour_from_start_mod24``: hour-of-day proxy from ``Time``
        (seconds from dataset start), i.e. floor(Time / 3600) mod 24.
      - ``is_night_proxy``: 1 for hours 22..5, else 0.
      - ``is_business_hours_proxy``: 1 for hours 9..17 inclusive, else 0.

    Returns a copy; the input frame is never mutated.
    """
    df = df.copy()

    # _log_amount
    if "_log_amount" not in df.columns and "Amount" in df.columns:
        df["_log_amount"] = np.log1p(df["Amount"].astype(float))

    # Hour_from_start_mod24 and proxies if Time exists (seconds from start)
    if "Hour_from_start_mod24" not in df.columns and "Time" in df.columns:
        hours = (np.floor(df["Time"].astype(float) / 3600) % 24).astype(int)
        df["Hour_from_start_mod24"] = hours
    if "is_night_proxy" not in df.columns and "Hour_from_start_mod24" in df.columns:
        df["is_night_proxy"] = (
            df["Hour_from_start_mod24"].isin([22, 23, 0, 1, 2, 3, 4, 5]).astype(int)
        )
    if (
        "is_business_hours_proxy" not in df.columns
        and "Hour_from_start_mod24" in df.columns
    ):
        df["is_business_hours_proxy"] = (
            df["Hour_from_start_mod24"].between(9, 17).astype(int)
        )

    return df


def predict_csv(file, threshold: float, return_all_rows: bool):
    """Score an uploaded CSV and return (preview DataFrame, CSV path).

    Parameters
    ----------
    file : gr.File payload (tempfile-like with ``.name``) or a plain path str.
    threshold : probability cut-off; rows with P(fraud) >= threshold get 1.
    return_all_rows : if False, preview the first 50 rows and write the
        first 200 rows to the download file (original preview behavior).

    Raises
    ------
    gr.Error
        When required feature columns are missing and cannot be derived.
        (The original returned a bare string here, which does not match the
        two declared outputs and would itself fail in the UI.)
    """
    # Newer Gradio versions may pass a plain filepath string instead of a
    # tempfile wrapper, so fall back to the value itself.
    csv_path = getattr(file, "name", file)
    df = ensure_engineered_columns(pd.read_csv(csv_path))

    # Check required columns
    missing = [c for c in FEATURES if c not in df.columns]
    if missing:
        raise gr.Error(
            f"āŒ Missing required columns: {missing}. Provide these or include "
            "'Time' and 'Amount' so the app can derive engineered features."
        )

    # Predict
    probs = model.predict_proba(df[FEATURES])[:, 1]
    preds = (probs >= threshold).astype(int)

    out = df.copy()
    out["Fraud_Probability"] = probs
    out["Prediction"] = preds

    # Write to a per-request temp file: a fixed "predictions.csv" in CWD
    # would be clobbered by concurrent users on a shared Space.
    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", prefix="predictions_", delete=False, newline=""
    )
    with tmp:
        (out if return_all_rows else out.head(200)).to_csv(tmp, index=False)

    # Display top rows + file for download
    display_df = out if return_all_rows else out.head(50)
    return display_df, tmp.name


with gr.Blocks() as demo:
    gr.Markdown("# šŸ’³ Credit Card Fraud Detection — Calibrated RF (HF Model)")
    gr.Markdown(
        "Upload a CSV with transaction rows. The app loads a calibrated Random Forest model "
        "and applies the **validation P≄90% threshold** by default. "
        "Required columns: V1..V28, Amount, and either engineered features or a raw Time column "
        "(seconds from start) so the app can derive them."
    )

    with gr.Row():
        file_in = gr.File(label="Upload CSV", file_types=[".csv"])
    with gr.Row():
        thr = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=DEFAULT_THR,
            step=0.001,
            label=f"Decision Threshold (default P≄90% = {DEFAULT_THR:.3f})",
        )
        all_rows = gr.Checkbox(
            label="Return all rows (uncheck to preview first 50)", value=False
        )

    btn = gr.Button("Predict")
    out_df = gr.Dataframe(label="Predictions")
    out_file = gr.File(label="Download predictions.csv")

    btn.click(fn=predict_csv, inputs=[file_in, thr, all_rows], outputs=[out_df, out_file])

if __name__ == "__main__":
    demo.launch()