Spaces:

Canstralian
/

cybersec-ml-pipeline

Running

App Files Files Community

cybersec-ml-pipeline / app.py

Canstralian

Update app.py

8776745 verified 11 days ago

raw

history blame contribute delete

5.05 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import classification_report
	from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
	from datasets import load_dataset

	# 1. Load Dataset
	# Builds the working DataFrame `df` from an uploaded file, a Hugging Face
	# dataset, or a bundled sample. `df` is bound only when loading succeeds:
	# the rest of the script detects a loaded dataset with `'df' in locals()`,
	# so no placeholder value is assigned on failure.
	st.header("1. Load Dataset")

	data_source = st.radio("Choose data source:", ["Upload File", "Hugging Face", "Sample Dataset"])

	if data_source == "Upload File":
	    uploaded_file = st.file_uploader("Upload your dataset (CSV, Excel, or Parquet)", type=["csv", "xlsx", "parquet"])
	    if uploaded_file:
	        # Choose the reader from the (case-insensitive) extension. The
	        # uploader accepts Parquet, which pd.read_excel cannot parse.
	        file_name = uploaded_file.name.lower()
	        try:
	            if file_name.endswith(".csv"):
	                df = pd.read_csv(uploaded_file)
	            elif file_name.endswith(".parquet"):
	                df = pd.read_parquet(uploaded_file)
	            else:
	                df = pd.read_excel(uploaded_file)
	        except Exception as e:
	            # UI boundary: report unreadable/corrupt files instead of crashing.
	            st.error(f"Error loading file: {str(e)}")
	        else:
	            st.success(f"Successfully loaded {uploaded_file.name}")

	elif data_source == "Hugging Face":
	    hf_dataset_name = st.text_input("Enter Hugging Face dataset name:")
	    if hf_dataset_name:
	        try:
	            dataset = load_dataset(hf_dataset_name.strip())
	            # Without a `split` argument load_dataset returns a dict-like
	            # DatasetDict keyed by split name, which has no to_pandas().
	            # Use the "train" split when present, otherwise the first one.
	            if isinstance(dataset, dict):
	                split_name = "train" if "train" in dataset else next(iter(dataset))
	                dataset = dataset[split_name]
	            df = dataset.to_pandas()
	            st.success(f"Loaded dataset: {hf_dataset_name}")
	        except Exception as e:
	            st.error(f"Error loading dataset: {str(e)}")

	else:  # Sample Dataset
	    sample_data = st.selectbox("Select a sample dataset:", ["Iris", "Wine", "Titanic"])
	    try:
	        if sample_data == "Wine":
	            # seaborn's example datasets do not include "wine"; use the
	            # copy bundled with scikit-learn (features plus a "target" column).
	            from sklearn.datasets import load_wine

	            df = load_wine(as_frame=True).frame
	        else:
	            df = sns.load_dataset(sample_data.lower())
	    except Exception as e:
	        # sns.load_dataset may need network access; surface failures in the UI.
	        st.error(f"Error loading sample dataset: {str(e)}")
	    else:
	        st.success(f"Loaded sample dataset: {sample_data}")

	if 'df' in locals():
	    st.dataframe(df.head())

	# 2. Explore Dataset
	# Shows shape, dtypes, optional missing-value counts, summary statistics
	# and an optional correlation heatmap for the loaded DataFrame `df`.
	st.header("2. Explore Dataset")

	if 'df' in locals():
	    st.subheader("Dataset Overview")
	    st.write(f"Shape: {df.shape}")
	    st.write("Column Information:")
	    # Show dtype names as plain text rather than raw dtype objects.
	    st.dataframe(df.dtypes.astype(str))

	    if st.checkbox("Show Missing Values"):
	        missing = df.isnull().sum()
	        missing = missing[missing > 0]
	        if missing.empty:
	            # Avoid rendering an empty chart when nothing is missing.
	            st.info("No missing values found.")
	        else:
	            st.bar_chart(missing)

	    st.subheader("Summary Statistics")
	    st.write(df.describe())

	    if st.checkbox("Generate Correlation Matrix"):
	        # Correlation is only defined for numeric columns; calling corr()
	        # on a frame that still holds text columns raises in pandas 2.x.
	        numeric_df = df.select_dtypes(include="number")
	        if numeric_df.shape[1] < 2:
	            st.warning("Need at least two numeric columns to compute correlations.")
	        else:
	            corr_matrix = numeric_df.corr()
	            heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
	            # Pass the figure explicitly instead of relying on the global
	            # pyplot figure, and clear it so reruns do not draw on top of
	            # the previous heatmap.
	            st.pyplot(heatmap_ax.figure, clear_figure=True)
	else:
	    st.warning("Load a dataset to explore.")

	# 3. Preprocess Dataset
	# Applies the user-selected missing-value strategy, categorical encoding
	# and feature scaling to `df`, rebinding or mutating `df` for later steps.
	st.header("3. Preprocess Dataset")

	if 'df' in locals():
	    st.subheader("Handle Missing Values")
	    missing_option = st.radio("Choose missing value strategy:", ["None", "Fill with Mean", "Drop Rows"])
	    if missing_option == "Fill with Mean":
	        # A mean only exists for numeric columns; without numeric_only the
	        # call raises on text columns. Non-numeric gaps are left untouched.
	        df = df.fillna(df.mean(numeric_only=True))
	    elif missing_option == "Drop Rows":
	        df = df.dropna()

	    st.subheader("Encode Categorical Variables")
	    encoding_method = st.radio("Encoding Method:", ["None", "One-Hot Encoding", "Label Encoding"])
	    if encoding_method == "One-Hot Encoding":
	        df = pd.get_dummies(df)
	    elif encoding_method == "Label Encoding":
	        # Cover pandas "category" columns as well as plain object columns.
	        categorical_cols = df.select_dtypes(include=["object", "category"]).columns
	        for col in categorical_cols:
	            # Cast to str so a column mixing strings and NaN can be sorted
	            # by LabelEncoder; missing values become their own "nan" class.
	            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

	    st.subheader("Feature Scaling")
	    scaling_method = st.radio("Scaling Method:", ["None", "Standardization", "Normalization"])
	    if scaling_method != "None":
	        scaler = StandardScaler() if scaling_method == "Standardization" else MinMaxScaler()
	        numeric_cols = df.select_dtypes(include="number").columns
	        # NOTE: every numeric column is scaled, including whichever column
	        # is later chosen as the training target in section 4.
	        if len(numeric_cols) == 0 or df.empty:
	            # The scalers raise on an empty input; report it instead.
	            st.warning("No numeric data to scale.")
	        else:
	            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

	    st.success("Preprocessing complete!")
	    st.dataframe(df.head())
	else:
	    st.warning("Load a dataset to preprocess.")

	# 4. Train Model
	# Lets the user pick a target column, a test-set fraction and a classifier,
	# then fits the model on `df` and shows a classification report.
	st.header("4. Train Model")

	if 'df' in locals():
	    st.subheader("Select Target Column")
	    target_col = st.selectbox("Choose the target column:", df.columns)
	    features = [col for col in df.columns if col != target_col]

	    # Encode a non-numeric target (object, category, ...) on the FULL
	    # column before splitting. Fitting the encoder on the training split
	    # only makes transform() fail for labels that appear solely in the
	    # test split.
	    target = df[target_col]
	    if not pd.api.types.is_numeric_dtype(target):
	        encoded = LabelEncoder().fit_transform(target.astype(str))
	        target = pd.Series(encoded, index=df.index, name=target_col)

	    st.subheader("Train/Test Split")
	    test_size = st.slider("Test size (percentage):", 10, 50, 20) / 100

	    st.subheader("Select and Train Model")
	    model_type = st.selectbox("Choose a model:", ["Logistic Regression", "Decision Tree", "Random Forest"])
	    if model_type == "Logistic Regression":
	        model = LogisticRegression(max_iter=200)  # Increase max_iter if needed
	    elif model_type == "Decision Tree":
	        model = DecisionTreeClassifier()
	    else:
	        model = RandomForestClassifier()

	    try:
	        # random_state is fixed so the split is reproducible across reruns.
	        X_train, X_test, y_train, y_test = train_test_split(
	            df[features], target, test_size=test_size, random_state=42
	        )
	        model.fit(X_train, y_train)
	    except (ValueError, TypeError) as e:
	        # Typical causes: text or NaN values left in the features, a
	        # continuous target, too few rows, or no feature columns.
	        st.error(f"Model training failed: {str(e)}")
	    else:
	        st.success("Model trained successfully!")

	        st.subheader("Model Performance")
	        y_pred = model.predict(X_test)
	        report = classification_report(y_test, y_pred, output_dict=True)
	        st.dataframe(pd.DataFrame(report).transpose())
	else:
	    st.warning("Load and preprocess a dataset to train a model.")