# Source: Hugging Face Space by Canstralian (app.py)
# Commit 8776745 ("Update app.py"), verified
import streamlit as st
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from datasets import load_dataset
# 1. Load Dataset
# Binds the module-level name `df` from one of three sources. `df` is left
# unbound when nothing could be loaded, because the sections of this script
# detect a loaded dataset with `'df' in locals()`.
st.header("1. Load Dataset")
data_source = st.radio("Choose data source:", ["Upload File", "Hugging Face", "Sample Dataset"])
if data_source == "Upload File":
    uploaded_file = st.file_uploader(
        "Upload your dataset (CSV, Excel, or Parquet)",
        type=["csv", "xlsx", "parquet"],
    )
    if uploaded_file:
        # Pick the reader from the file extension (case-insensitively).
        file_name = uploaded_file.name.lower()
        try:
            if file_name.endswith(".csv"):
                df = pd.read_csv(uploaded_file)
            elif file_name.endswith(".parquet"):
                # Parquet is offered by the uploader, so it needs its own
                # reader; read_excel cannot parse it.
                df = pd.read_parquet(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
        except Exception as e:
            # UI boundary: report unreadable files instead of crashing the app.
            st.error(f"Error loading file: {str(e)}")
        else:
            st.success(f"Successfully loaded {uploaded_file.name}")
elif data_source == "Hugging Face":
    hf_dataset_name = st.text_input("Enter Hugging Face dataset name:")
    if hf_dataset_name:
        try:
            dataset = load_dataset(hf_dataset_name)
            if isinstance(dataset, dict):
                # Without `split=`, load_dataset returns a mapping of split
                # name -> dataset, which has no to_pandas(). Prefer "train",
                # otherwise fall back to the first available split.
                split_name = "train" if "train" in dataset else next(iter(dataset))
                dataset = dataset[split_name]
            df = dataset.to_pandas()
            st.success(f"Loaded dataset: {hf_dataset_name}")
        except Exception as e:
            st.error(f"Error loading dataset: {str(e)}")
else:  # Sample Dataset
    sample_data = st.selectbox("Select a sample dataset:", ["Iris", "Wine", "Titanic"])
    try:
        if sample_data == "Wine":
            # seaborn's sample-data collection has no "wine" entry, so use the
            # copy bundled with scikit-learn (features plus a "target" column).
            from sklearn.datasets import load_wine

            df = load_wine(as_frame=True).frame
        else:
            df = sns.load_dataset(sample_data.lower())
    except Exception as e:
        # sns.load_dataset may need network access; surface failures in the UI.
        st.error(f"Error loading sample dataset: {str(e)}")
    else:
        st.success(f"Loaded sample dataset: {sample_data}")
if 'df' in locals():
    st.dataframe(df.head())
# 2. Explore Dataset
# Shows shape, dtypes, summary statistics and two optional views (missing
# values, correlation heatmap) for the loaded `df`, if one exists.
st.header("2. Explore Dataset")
if 'df' in locals():
    st.subheader("Dataset Overview")
    st.write(f"Shape: {df.shape}")
    st.write("Column Information:")
    st.dataframe(df.dtypes)
    if st.checkbox("Show Missing Values"):
        missing = df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            # Avoid rendering an empty chart when nothing is missing.
            st.info("No missing values found.")
        else:
            st.bar_chart(missing)
    st.subheader("Summary Statistics")
    st.write(df.describe())
    if st.checkbox("Generate Correlation Matrix"):
        # Correlation is only defined for numeric columns; passing text
        # columns to corr() raises on recent pandas versions.
        corr_matrix = df.select_dtypes(include="number").corr()
        if corr_matrix.empty:
            st.warning("No numeric columns available for a correlation matrix.")
        else:
            heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
            # Hand Streamlit the explicit figure (not the implicit global
            # one) and clear it so reruns of the script do not stack heatmaps.
            st.pyplot(heatmap_ax.get_figure(), clear_figure=True)
else:
    st.warning("Load a dataset to explore.")
# 3. Preprocess Dataset
# Applies the user-selected missing-value strategy, categorical encoding and
# feature scaling to `df`, rebinding/mutating the module-level `df`.
st.header("3. Preprocess Dataset")
if 'df' in locals():
    st.subheader("Handle Missing Values")
    missing_option = st.radio("Choose missing value strategy:", ["None", "Fill with Mean", "Drop Rows"])
    if missing_option == "Fill with Mean":
        # Means exist only for numeric columns; without numeric_only the call
        # raises on frames that also hold text. Non-numeric gaps stay as-is.
        df = df.fillna(df.mean(numeric_only=True))
    elif missing_option == "Drop Rows":
        df = df.dropna()
    st.subheader("Encode Categorical Variables")
    encoding_method = st.radio("Encoding Method:", ["None", "One-Hot Encoding", "Label Encoding"])
    if encoding_method == "One-Hot Encoding":
        df = pd.get_dummies(df)
    elif encoding_method == "Label Encoding":
        # Include pandas "category" columns as well as plain object columns;
        # the Titanic sample, for example, carries both.
        categorical_cols = df.select_dtypes(include=["object", "category"]).columns
        for col in categorical_cols:
            # A fresh encoder per column keeps each column's label mapping independent.
            df[col] = LabelEncoder().fit_transform(df[col])
    st.subheader("Feature Scaling")
    scaling_method = st.radio("Scaling Method:", ["None", "Standardization", "Normalization"])
    if scaling_method != "None":
        scaler = StandardScaler() if scaling_method == "Standardization" else MinMaxScaler()
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols) > 0 and len(df) > 0:
            # NOTE(review): every numeric column is scaled, including one that
            # may later be picked as the classification target; scaling turns
            # integer class labels into floats. Confirm this is intended.
            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        else:
            # The scalers reject input with zero columns or zero rows.
            st.warning("No numeric data to scale.")
    st.success("Preprocessing complete!")
    st.dataframe(df.head())
else:
    st.warning("Load a dataset to preprocess.")
# 4. Train Model
# Lets the user pick a target column, a test-set fraction and a classifier,
# then fits the model on `df` and renders a classification report.
st.header("4. Train Model")
if 'df' in locals():
    st.subheader("Select Target Column")
    target_col = st.selectbox("Choose the target column:", df.columns)
    features = [col for col in df.columns if col != target_col]
    st.subheader("Train/Test Split")
    test_size = st.slider("Test size (percentage):", 10, 50, 20) / 100
    # Encode a non-numeric target (object, category, ...) BEFORE splitting.
    # Fitting the encoder on the training split alone makes transform() fail
    # whenever a label occurs only in the test split.
    target = df[target_col]
    if not pd.api.types.is_numeric_dtype(target):
        le = LabelEncoder()
        target = le.fit_transform(target)
    st.subheader("Select and Train Model")
    model_type = st.selectbox("Choose a model:", ["Logistic Regression", "Decision Tree", "Random Forest"])
    if model_type == "Logistic Regression":
        model = LogisticRegression(max_iter=200)  # Increase max_iter if needed
    elif model_type == "Decision Tree":
        model = DecisionTreeClassifier()
    else:
        model = RandomForestClassifier()
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            df[features], target, test_size=test_size, random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
    except (ValueError, TypeError) as e:
        # Typical causes: non-numeric or missing feature values, a continuous
        # target, or too few rows. Show the reason instead of a traceback.
        st.error(f"Model training failed: {str(e)}")
    else:
        st.success("Model trained successfully!")
        st.subheader("Model Performance")
        st.dataframe(pd.DataFrame(report).transpose())
else:
    st.warning("Load and preprocess a dataset to train a model.")