import pandas as pd
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


# ---------------------------------------------------------------------------
# Section 1: load a dataset into `df` from an upload, the Hugging Face hub,
# or a bundled sample.
#
# NOTE: the later sections detect a loaded dataset with `'df' in locals()`,
# so `df` is only ever bound here when loading actually succeeded.
# ---------------------------------------------------------------------------
st.header("1. Load Dataset")

data_source = st.radio(
    "Choose data source:", ["Upload File", "Hugging Face", "Sample Dataset"]
)

if data_source == "Upload File":
    uploaded_file = st.file_uploader(
        "Upload your dataset (CSV, Excel, or Parquet)",
        type=["csv", "xlsx", "parquet"],
    )
    if uploaded_file:
        # Compare case-insensitively so "DATA.CSV" is not parsed as Excel.
        file_name = uploaded_file.name.lower()
        try:
            if file_name.endswith(".csv"):
                df = pd.read_csv(uploaded_file)
            elif file_name.endswith(".parquet"):
                # Parquet is an accepted upload type and needs its own reader;
                # it must not fall through to the Excel reader.
                df = pd.read_parquet(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
        except Exception as e:  # UI boundary: report instead of crashing
            st.error(f"Error loading file: {str(e)}")
        else:
            st.success(f"Successfully loaded {uploaded_file.name}")

elif data_source == "Hugging Face":
    hf_dataset_name = st.text_input("Enter Hugging Face dataset name:")
    if hf_dataset_name:
        try:
            dataset = load_dataset(hf_dataset_name)
            if hasattr(dataset, "to_pandas"):
                df = dataset.to_pandas()
            else:
                # Without a `split` argument the loader returns a mapping of
                # split name -> dataset, which has no to_pandas(); prefer the
                # "train" split and otherwise take the first one available.
                split_name = "train" if "train" in dataset else next(iter(dataset))
                df = dataset[split_name].to_pandas()
            st.success(f"Loaded dataset: {hf_dataset_name}")
        except Exception as e:  # UI boundary: report instead of crashing
            st.error(f"Error loading dataset: {str(e)}")

else:
    sample_data = st.selectbox("Select a sample dataset:", ["Iris", "Wine", "Titanic"])
    try:
        if sample_data == "Wine":
            # seaborn has no "wine" sample dataset; use scikit-learn's copy.
            df = load_wine(as_frame=True).frame
        else:
            df = sns.load_dataset(sample_data.lower())
    except Exception as e:  # e.g. seaborn could not fetch the sample data
        st.error(f"Error loading dataset: {str(e)}")
    else:
        st.success(f"Loaded sample dataset: {sample_data}")

if 'df' in locals():
    st.dataframe(df.head())


# ---------------------------------------------------------------------------
# Section 2: show shape, dtypes, missing values, summary statistics and an
# optional correlation heatmap for the loaded dataset.
# ---------------------------------------------------------------------------
st.header("2. Explore Dataset")

if 'df' in locals():
    st.subheader("Dataset Overview")
    st.write(f"Shape: {df.shape}")
    st.write("Column Information:")
    # Render dtypes as plain strings so the table holds displayable values
    # rather than dtype objects.
    st.dataframe(df.dtypes.astype(str))

    if st.checkbox("Show Missing Values"):
        missing = df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            # Avoid drawing an empty chart when nothing is missing.
            st.info("No missing values found.")
        else:
            st.bar_chart(missing)

    st.subheader("Summary Statistics")
    st.write(df.describe())

    if st.checkbox("Generate Correlation Matrix"):
        # Correlation is only defined for numeric columns; restricting the
        # frame first avoids errors on string/categorical columns.
        corr_matrix = df.select_dtypes(include="number").corr()
        if corr_matrix.empty:
            st.info("No numeric columns available for a correlation matrix.")
        else:
            heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
            # Pass the figure explicitly (no reliance on the global figure)
            # and clear it so reruns do not draw on top of the old heatmap.
            st.pyplot(heatmap_ax.figure, clear_figure=True)
else:
    st.warning("Load a dataset to explore.")


# ---------------------------------------------------------------------------
# Section 3: optional missing-value handling, categorical encoding and
# feature scaling, applied to `df` in that order.
#
# NOTE(review): encoding and scaling are applied to every matching column,
# including whichever column is later picked as the training target. A scaled
# or one-hot-encoded target is no longer a usable class label — confirm
# whether the target should be chosen before preprocessing.
# ---------------------------------------------------------------------------
st.header("3. Preprocess Dataset")

if 'df' in locals():
    st.subheader("Handle Missing Values")
    missing_option = st.radio(
        "Choose missing value strategy:", ["None", "Fill with Mean", "Drop Rows"]
    )
    if missing_option == "Fill with Mean":
        # Means exist only for numeric columns; non-numeric columns are left
        # untouched instead of raising.
        df = df.fillna(df.mean(numeric_only=True))
    elif missing_option == "Drop Rows":
        df = df.dropna()

    st.subheader("Encode Categorical Variables")
    encoding_method = st.radio(
        "Encoding Method:", ["None", "One-Hot Encoding", "Label Encoding"]
    )
    if encoding_method == "One-Hot Encoding":
        df = pd.get_dummies(df)
    elif encoding_method == "Label Encoding":
        le = LabelEncoder()
        # Work on a copy so the column assignments never write into a view.
        df = df.copy()
        # Include pandas "category" columns as well as plain object columns.
        for col in df.select_dtypes(include=["object", "category"]).columns:
            # Cast to str so columns mixing strings and NaN can be encoded;
            # missing values become their own "nan" class.
            df[col] = le.fit_transform(df[col].astype(str))

    st.subheader("Feature Scaling")
    scaling_method = st.radio(
        "Scaling Method:", ["None", "Standardization", "Normalization"]
    )
    if scaling_method != "None":
        scaler = StandardScaler() if scaling_method == "Standardization" else MinMaxScaler()
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols) > 0 and len(df) > 0:
            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        else:
            # The scalers reject empty input; skip instead of crashing.
            st.warning("No numeric data available to scale.")

    st.success("Preprocessing complete!")
    st.dataframe(df.head())
else:
    st.warning("Load a dataset to preprocess.")


# ---------------------------------------------------------------------------
# Section 4: pick a target column, split the data, train the selected
# classifier and display a classification report for the test split.
# ---------------------------------------------------------------------------
st.header("4. Train Model")

if 'df' in locals():
    st.subheader("Select Target Column")
    target_col = st.selectbox("Choose the target column:", df.columns)
    features = [col for col in df.columns if col != target_col]

    st.subheader("Train/Test Split")
    test_size = st.slider("Test size (percentage):", 10, 50, 20) / 100

    # Encode a non-numeric target BEFORE splitting: fitting the encoder on the
    # training split only would fail on labels that occur only in the test
    # split.
    target = df[target_col]
    if target.dtype == 'object' or isinstance(target.dtype, pd.CategoricalDtype):
        le = LabelEncoder()
        target = le.fit_transform(target.astype(str))

    st.subheader("Select and Train Model")
    model_type = st.selectbox(
        "Choose a model:", ["Logistic Regression", "Decision Tree", "Random Forest"]
    )
    if model_type == "Logistic Regression":
        model = LogisticRegression(max_iter=200)
    elif model_type == "Decision Tree":
        model = DecisionTreeClassifier()
    else:
        model = RandomForestClassifier()

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            df[features], target, test_size=test_size, random_state=42
        )
        model.fit(X_train, y_train)
    except (ValueError, TypeError) as e:
        # Typical causes: non-numeric or missing feature values, a continuous
        # target, or too few rows to split. Report instead of crashing.
        st.error(f"Model training failed: {str(e)}")
    else:
        st.success("Model trained successfully!")

        st.subheader("Model Performance")
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        st.dataframe(pd.DataFrame(report).transpose())
else:
    st.warning("Load and preprocess a dataset to train a model.")