Canstralian committed on
Commit 7316b09 · verified · 1 Parent(s): c7d95a9

Upload 16 files

Files changed (16)
  1. .gitignore +43 -0
  2. .replit +39 -0
  3. CODE_OF_CONDUCT.md +52 -0
  4. CONTRIBUTING.md +62 -0
  5. LICENSE +21 -0
  6. README.md +36 -31
  7. app.py +152 -23
  8. data_processing.py +129 -0
  9. generated-icon.png +0 -0
  10. model_training.py +48 -0
  11. pyproject.toml +17 -0
  12. replit.nix +16 -0
  13. replit_zip_error_log.txt +83 -0
  14. utils.py +149 -0
  15. uv.lock +0 -0
  16. visualizations.py +93 -0
.gitignore ADDED
@@ -0,0 +1,43 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Model files
+models/
+
+# Logs
+*.log
+
+# System
+.DS_Store
+Thumbs.db
.replit ADDED
@@ -0,0 +1,39 @@
+modules = ["python-3.11"]
+
+[nix]
+channel = "stable-24_05"
+
+[deployment]
+deploymentTarget = "autoscale"
+run = ["sh", "-c", "streamlit run app.py --server.port 5000"]
+
+[workflows]
+runButton = "Project"
+
+[[workflows.workflow]]
+name = "Project"
+mode = "parallel"
+author = "agent"
+
+[[workflows.workflow.tasks]]
+task = "workflow.run"
+args = "Streamlit App"
+
+[[workflows.workflow]]
+name = "Streamlit App"
+author = "agent"
+
+[workflows.workflow.metadata]
+agentRequireRestartOnSave = false
+
+[[workflows.workflow.tasks]]
+task = "packager.installForAll"
+
+[[workflows.workflow.tasks]]
+task = "shell.exec"
+args = "streamlit run app.py --server.port 5000"
+waitForPort = 5000
+
+[[ports]]
+localPort = 5000
+externalPort = 80
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,52 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior:
+
+* The use of sexualized language or imagery, and sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information without explicit permission
+* Other conduct which could reasonably be considered inappropriate
+
+## Enforcement Responsibilities
+
+Project maintainers are responsible for clarifying and enforcing standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the project team. All complaints will be reviewed and investigated
+promptly and fairly.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
CONTRIBUTING.md ADDED
@@ -0,0 +1,62 @@
+# Contributing to ML Pipeline for Cybersecurity Purple Teaming
+
+First off, thank you for considering contributing to our project! 🎉
+
+## Code of Conduct
+
+This project and everyone participating in it is governed by our [Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
+
+## How Can I Contribute?
+
+### Reporting Bugs 🐛
+
+- Use the GitHub issue tracker
+- Check if the bug has already been reported
+- Include detailed steps to reproduce the bug
+- Provide system information and stack traces if applicable
+
+### Suggesting Enhancements 💡
+
+- First, read the documentation to make sure the functionality doesn't already exist
+- Use the GitHub issue tracker and clearly describe the feature
+- Explain why this enhancement would be useful
+- Keep the scope as narrow as possible
+
+### Pull Requests 🔧
+
+1. Fork the repo and create your branch from `main`
+2. If you've added code that should be tested, add tests
+3. Ensure the test suite passes
+4. Make sure your code lints
+5. Issue that pull request!
+
+## Development Process
+
+1. **Setup Development Environment**
+   ```bash
+   pip install -r requirements-dev.txt
+   ```
+
+2. **Run Tests**
+   ```bash
+   pytest
+   ```
+
+3. **Code Style**
+   - Follow PEP 8 guidelines
+   - Use meaningful variable names
+   - Add comments for complex logic
+   - Write docstrings for functions and classes
+
+4. **Commit Messages**
+   - Use clear, descriptive commit messages
+   - Reference issues and pull requests
+   - Keep commits atomic and focused
+
+## Documentation 📚
+
+- Update README.md with details of changes to the interface
+- Update docstrings and comments
+- Add any new installation requirements
+
+Thank you for your contribution! 🙏
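The contributing guide assumes a pytest-based test suite (and a `requirements-dev.txt` that is not part of this upload). As a rough sketch of what such a test could look like for the helpers added in this commit, assuming a hypothetical `test_utils.py` at the repo root:

```python
# Illustrative only: a minimal pytest module exercising utils.get_feature_names.
import pandas as pd

from utils import get_feature_names


def test_get_feature_names_returns_numeric_columns_only():
    df = pd.DataFrame({
        "bytes_sent": [10, 20, 30],         # numeric -> should be returned
        "protocol": ["tcp", "udp", "tcp"],  # non-numeric -> should be ignored
    })
    assert get_feature_names(df) == ["bytes_sent"]
```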
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Cybersecurity ML Pipeline Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,20 +1,21 @@
----
-title: ML Pipeline for Cybersecurity Purple Teaming 🛡️
-emoji: 🏃
-colorFrom: indigo
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.41.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: A Streamlit-based machine learning pipeline platform
----
+---
+title: ML Pipeline for Cybersecurity Purple Teaming
+emoji: 🛡️
+colorFrom: red
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.28.1
+app_file: app.py
+pinned: false
+license: mit
+---
 
 # ML Pipeline for Cybersecurity Purple Teaming 🛡️
 
 A scalable Streamlit-based machine learning pipeline platform specialized for cybersecurity purple-teaming, enabling advanced data processing and model training.
 
+[![Open In Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+
 ## Features 🚀
 
 - **Distributed Data Processing**: Leverage Dask for handling large-scale datasets
@@ -32,21 +33,11 @@ A scalable Streamlit-based machine learning pipeline platform specialized for cy
 
 ## Getting Started 🏁
 
-1. **Clone the repository**
-```bash
-git clone https://github.com/yourusername/cybersec-ml-pipeline.git
-cd cybersec-ml-pipeline
-```
-
-2. **Install dependencies**
-```bash
-pip install -r requirements.txt
-```
-
-3. **Run the application**
-```bash
-streamlit run app.py
-```
-
+1. Visit the [Space on Hugging Face Hub](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+2. Upload your cybersecurity dataset (CSV/JSON format)
+3. Configure the ML pipeline parameters
+4. Train and evaluate your model
+5. Export the trained model for deployment
 
 ## Usage Guide 📖
 
@@ -64,13 +55,27 @@
 - Real-time performance metrics
 - Visual model evaluation
 
-## Contributing 🤝
-
-Please read our [Contributing Guidelines](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
-
-## Security 🔒
-
-For security concerns, please review our [Security Policy](.github/SECURITY.md).
-
+## Local Development
+
+1. **Clone the repository**
+```bash
+git clone https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline
+cd cybersec-ml-pipeline
+```
+
+2. **Install dependencies**
+```bash
+pip install -r requirements.txt
+```
+
+3. **Run the application**
+```bash
+streamlit run app.py
+```
+
+## Contributing 🤝
+
+Please read our [Contributing Guidelines](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
+
 ## License 📄
 
app.py CHANGED
@@ -6,6 +6,12 @@ from model_training import ModelTrainer
 from visualizations import Visualizer
 from utils import load_data, get_feature_names, save_model, load_saved_model, list_saved_models
 import warnings
+import re
+from typing import Optional
+from datasets import load_dataset
+from huggingface_hub import list_datasets
+import traceback
+
 warnings.filterwarnings('ignore')
 
 st.set_page_config(
@@ -14,21 +20,99 @@
     layout="wide"
 )
 
+def validate_model_name(name: Optional[str]) -> str:
+    """Validate and sanitize model name"""
+    if not name:
+        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
+    sanitized = re.sub(r'[^\w\-]', '_', name)
+    return sanitized
+
+def load_hf_dataset(dataset_name: str, config_name: Optional[str] = None) -> pd.DataFrame:
+    """Load a dataset from Hugging Face and convert to pandas DataFrame"""
+    try:
+        if config_name:
+            dataset = load_dataset(dataset_name, config_name)
+        else:
+            dataset = load_dataset(dataset_name)
+
+        # Convert to pandas DataFrame (using first split, usually 'train')
+        split_name = list(dataset.keys())[0]
+        df = dataset[split_name].to_pandas()
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading dataset from Hugging Face: {str(e)}\n{traceback.format_exc()}")
+
 def main():
     st.title("🛡️ ML Pipeline for Cybersecurity Purple Teaming")
 
+    # Initialize default values for feature engineering
+    if 'poly_degree' not in st.session_state:
+        st.session_state.poly_degree = 2
+    if 'k_best_features' not in st.session_state:
+        st.session_state.k_best_features = 10
+    if 'n_components' not in st.session_state:
+        st.session_state.n_components = 0.95
+
     # Sidebar
     st.sidebar.header("Pipeline Configuration")
 
-    # File upload
-    uploaded_file = st.sidebar.file_uploader(
-        "Upload Dataset (CSV/JSON)",
-        type=['csv', 'json']
+    # Data Input Tabs
+    data_input_tab = st.radio(
+        "Choose Data Source",
+        ["Upload File", "Load from Hugging Face"]
    )
 
-    if uploaded_file is not None:
+    df = None
+
+    if data_input_tab == "Upload File":
+        uploaded_file = st.file_uploader(
+            "Upload Dataset (CSV/JSON)",
+            type=['csv', 'json']
+        )
+        if uploaded_file is not None:
+            try:
+                df = load_data(uploaded_file)
+            except Exception as e:
+                st.error(f"Error loading file: {str(e)}")
+    else:
+        # Hugging Face Dataset Loading
+        st.markdown("### Load Dataset from Hugging Face")
+        dataset_name = st.text_input(
+            "Dataset Name",
+            help="Enter the Hugging Face dataset name (e.g., 'username/dataset-name')"
+        )
+        config_name = st.text_input(
+            "Configuration Name (Optional)",
+            help="Enter the specific configuration name if the dataset has multiple configurations"
+        )
+
+        if dataset_name:
+            try:
+                with st.spinner("Loading dataset from Hugging Face..."):
+                    df = load_hf_dataset(
+                        dataset_name,
+                        config_name if config_name else None
+                    )
+                st.success(f"Successfully loaded dataset: {dataset_name}")
+            except Exception as e:
+                st.error(str(e))
+
+    if df is not None:
         try:
-            df = load_data(uploaded_file)
+            # Validate data
+            if df.empty:
+                st.error("The dataset contains no data.")
+                return
+
+            if df.shape[1] < 2:
+                st.error("Dataset must contain at least two columns (features and target).")
+                return
+
+            # Check for numeric columns
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            if len(numeric_cols) == 0:
+                st.error("Dataset must contain at least one numeric column for analysis.")
+                return
 
             # Initialize components
            processor = DataProcessor()
@@ -68,37 +152,66 @@
            st.subheader("Advanced Features")
            use_polynomial = st.checkbox("Use Polynomial Features")
            if use_polynomial:
-                poly_degree = st.slider("Polynomial Degree", 2, 5, 2)
+                st.session_state.poly_degree = st.slider("Polynomial Degree", 2, 5, st.session_state.poly_degree)
 
            use_feature_selection = st.checkbox("Use Feature Selection")
            if use_feature_selection:
-                k_best_features = st.slider("Number of Best Features", 5, 50, 10)
+                max_features = min(50, df.shape[1])  # Limit k_best_features to number of columns
+                st.session_state.k_best_features = st.slider(
+                    "Number of Best Features",
+                    2,  # Minimum 2 features required
+                    max_features,
+                    min(st.session_state.k_best_features, max_features),
+                    help="Select the number of most important features to use"
+                )
 
            with col4:
                use_pca = st.checkbox("Use PCA")
                if use_pca:
-                    n_components = st.slider("PCA Components (%)", 1, 100, 95) / 100.0
+                    st.session_state.n_components = st.slider(
+                        "PCA Components (%)",
+                        1, 100,
+                        int(st.session_state.n_components * 100),
+                        help="Percentage of variance to preserve"
+                    ) / 100.0
 
                add_cyber_features = st.checkbox("Add Cybersecurity Features")
 
+            numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
+            if not numeric_features:
+                st.error("No numeric features found in the dataset.")
+                return
+
            feature_cols = st.multiselect(
                "Select Features",
-                get_feature_names(df),
-                default=get_feature_names(df)
+                numeric_features,
+                default=numeric_features,
+                help="Select the features to use for training"
            )
+
+            if not feature_cols:
+                st.error("Please select at least one feature column")
+                return
+
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
            target_col = st.selectbox(
                "Select Target Column",
-                df.columns.tolist()
+                [col for col in categorical_cols if col not in feature_cols],
+                help="Select the target variable to predict"
            )
 
+            if target_col is None:
+                st.error("No suitable target column found. Target should be categorical.")
+                return
+
            # Create feature engineering config
            feature_engineering_config = {
                'use_polynomial': use_polynomial,
-                'poly_degree': poly_degree if use_polynomial else None,
+                'poly_degree': st.session_state.poly_degree if use_polynomial else None,
                'use_feature_selection': use_feature_selection,
-                'k_best_features': k_best_features if use_feature_selection else None,
+                'k_best_features': st.session_state.k_best_features if use_feature_selection else None,
                'use_pca': use_pca,
-                'n_components': n_components if use_pca else None,
+                'n_components': st.session_state.n_components if use_pca else None,
                'add_cyber_features': add_cyber_features
            }
 
@@ -164,16 +277,28 @@
            for metric, value in metrics.items():
                st.metric(metric, f"{value:.4f}")
 
-            # Add model export section
+            # Add model export section with improved validation
            st.subheader("Export Model")
-            model_name = st.text_input("Model Name (optional)")
+            model_name = st.text_input(
+                "Model Name (optional)",
+                help="Enter a name for your model (alphanumeric and underscores only)"
+            )
+
            if st.button("Save Model"):
                try:
+                    # Validate and sanitize model name
+                    sanitized_name = validate_model_name(model_name)
+
+                    if sanitized_name != model_name:
+                        st.warning(f"Model name was sanitized to: {sanitized_name}")
+
                    # Save model and metadata
                    preprocessing_params = {
                        'feature_engineering_config': feature_engineering_config,
                        'handling_strategy': handling_strategy,
-                        'scaling_method': scaling_method
+                        'scaling_method': scaling_method,
+                        'feature_columns': feature_cols,
+                        'target_column': target_col
                    }
 
                    model_path, metadata_path = save_model(
@@ -181,12 +306,13 @@
                        feature_cols,
                        preprocessing_params,
                        metrics,
-                        model_name
+                        sanitized_name
                    )
 
-                    st.success(f"Model saved successfully! Files:\n- {model_path}\n- {metadata_path}")
+                    st.success(f"Model saved successfully!\nFiles:\n- {model_path}\n- {metadata_path}")
                except Exception as e:
                    st.error(f"Error saving model: {str(e)}")
+                    st.error("Please ensure you have proper permissions and sufficient disk space.")
 
            with col8:
                if not use_pca:  # Skip feature importance for PCA
@@ -215,10 +341,13 @@
                    st.pyplot(fig_roc)
 
        except Exception as e:
-            st.error(f"Error: {str(e)}")
-
+            st.error(f"An error occurred: {str(e)}")
+            st.error("Please check your input data and try again.")
    else:
-        st.info("Please upload a dataset to begin.")
+        if data_input_tab == "Upload File":
+            st.info("Please upload a dataset to begin.")
+        else:
+            st.info("Please enter a Hugging Face dataset name to begin.")
 
    # Add Model Management Section
    st.header("5. Saved Models")
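The two helpers introduced above can be exercised outside Streamlit for quick sanity checks. A minimal sketch, assuming `datasets` and `pandas` are installed; the dataset id below is a placeholder, not something shipped with this commit:

```python
# Standalone sketch of the helpers this commit adds to app.py.
# "username/dataset-name" is a placeholder dataset id.
import re
import pandas as pd
from datasets import load_dataset


def validate_model_name(name):
    """Fall back to a timestamped name and replace anything outside [A-Za-z0-9_-] with '_'."""
    if not name:
        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
    return re.sub(r'[^\w\-]', '_', name)


def load_hf_dataframe(dataset_name, config_name=None):
    """Load a Hugging Face dataset and return its first split as a pandas DataFrame."""
    dataset = load_dataset(dataset_name, config_name) if config_name else load_dataset(dataset_name)
    first_split = list(dataset.keys())[0]  # usually 'train'
    return dataset[first_split].to_pandas()


print(validate_model_name("purple team model!"))  # -> purple_team_model_
df = load_hf_dataframe("username/dataset-name")   # placeholder dataset id
print(df.shape)
```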
data_processing.py ADDED
@@ -0,0 +1,129 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
+from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.decomposition import PCA
+import dask.dataframe as dd
+
+class DataProcessor:
+    def __init__(self):
+        self.scaler = None
+        self.imputer = None
+        self.poly_features = None
+        self.feature_selector = None
+        self.pca = None
+
+    def _get_scaler(self, method):
+        """Returns the appropriate scaler based on method."""
+        scalers = {
+            'standard': StandardScaler(),
+            'minmax': MinMaxScaler(),
+            'robust': RobustScaler()
+        }
+        return scalers.get(method, StandardScaler())
+
+    def _get_imputer(self, strategy):
+        """Returns the appropriate imputer based on strategy."""
+        return SimpleImputer(strategy=strategy)
+
+    def _engineer_features(self, X, feature_engineering_config, y=None):
+        """Apply feature engineering transformations."""
+        # Polynomial Features
+        if feature_engineering_config.get('use_polynomial', False):
+            degree = feature_engineering_config.get('poly_degree', 2)
+            self.poly_features = PolynomialFeatures(degree=degree, include_bias=False)
+            X = self.poly_features.fit_transform(X)
+
+        # Feature Selection (f_classif scores features against the target, so y is required here)
+        if feature_engineering_config.get('use_feature_selection', False):
+            k = feature_engineering_config.get('k_best_features', 10)
+            self.feature_selector = SelectKBest(score_func=f_classif, k=k)
+            X = self.feature_selector.fit_transform(X, y)
+
+        # Dimensionality Reduction
+        if feature_engineering_config.get('use_pca', False):
+            n_components = feature_engineering_config.get('n_components', 0.95)
+            self.pca = PCA(n_components=n_components)
+            X = self.pca.fit_transform(X)
+
+        # Add cybersecurity-specific features
+        if feature_engineering_config.get('add_cyber_features', False):
+            X = self._add_cyber_features(X)
+
+        return X
+
+    def _add_cyber_features(self, X):
+        """Add cybersecurity-specific engineered features."""
+        # Convert back to DataFrame for feature engineering
+        X_df = pd.DataFrame(X)
+
+        # Example cyber features (modify based on your specific needs):
+        # - Entropy of numerical features
+        # - Statistical moments (skewness, kurtosis)
+        # - Rolling windows statistics
+
+        for col in X_df.columns:
+            if X_df[col].dtype in ['float64', 'int64']:
+                # Entropy-style score (-x * log2(x)) for positive values; 0 otherwise to avoid NaNs
+                X_df[f'{col}_entropy'] = X_df[col].apply(lambda x: -np.sum(x * np.log2(x)) if x > 0 else 0)
+
+                # Add statistical moments
+                X_df[f'{col}_skew'] = X_df[col].skew()
+                X_df[f'{col}_kurt'] = X_df[col].kurtosis()
+
+                # Add rolling statistics (fill the undefined first-window std with 0)
+                X_df[f'{col}_rolling_mean'] = X_df[col].rolling(window=3, min_periods=1).mean()
+                X_df[f'{col}_rolling_std'] = X_df[col].rolling(window=3, min_periods=1).std().fillna(0)
+
+        return X_df.values
+
+    def process_data(self, df, feature_cols, target_col, impute_strategy='mean',
+                     scaling_method='standard', feature_engineering_config=None):
+        """
+        Process the data using Dask for large datasets.
+
+        Args:
+            df: pandas DataFrame
+            feature_cols: list of feature columns
+            target_col: target column name
+            impute_strategy: strategy for handling missing values
+            scaling_method: method for scaling features
+            feature_engineering_config: dictionary of feature engineering parameters
+
+        Returns:
+            X_train, X_test, y_train, y_test: processed and split data
+        """
+        try:
+            # Convert to Dask DataFrame for large dataset handling
+            ddf = dd.from_pandas(df, npartitions=4)
+
+            # Select features and target
+            X = ddf[feature_cols].compute()
+            y = ddf[target_col].compute()
+
+            # Handle missing values
+            self.imputer = self._get_imputer(impute_strategy)
+            X = self.imputer.fit_transform(X)
+
+            # Scale features
+            self.scaler = self._get_scaler(scaling_method)
+            X = self.scaler.fit_transform(X)
+
+            # Apply feature engineering if config is provided
+            if feature_engineering_config:
+                X = self._engineer_features(X, feature_engineering_config, y)
+
+            # Split data
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
+                test_size=0.2,
+                random_state=42,
+                stratify=y if len(np.unique(y)) > 1 else None
+            )
+
+            return X_train, X_test, y_train, y_test
+
+        except Exception as e:
+            raise Exception(f"Error in data processing: {str(e)}")
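For orientation, a minimal usage sketch of `DataProcessor.process_data` on a small synthetic frame; the column names and values are made up for illustration:

```python
# Illustrative only: a tiny synthetic dataset run through DataProcessor.
import numpy as np
import pandas as pd
from data_processing import DataProcessor

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "packet_size": rng.integers(40, 1500, size=200),
    "duration": rng.random(200),
    "label": rng.integers(0, 2, size=200),  # binary target
})

processor = DataProcessor()
X_train, X_test, y_train, y_test = processor.process_data(
    df,
    feature_cols=["packet_size", "duration"],
    target_col="label",
    impute_strategy="mean",
    scaling_method="standard",
)
print(X_train.shape, X_test.shape)
```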
generated-icon.png ADDED
model_training.py ADDED
@@ -0,0 +1,48 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import numpy as np
+
+class ModelTrainer:
+    def __init__(self):
+        self.model = None
+
+    def train_model(self, X_train, X_test, y_train, y_test, **kwargs):
+        """
+        Train a Random Forest model with given parameters.
+
+        Args:
+            X_train, X_test, y_train, y_test: Training and test data
+            **kwargs: Model parameters
+
+        Returns:
+            model: Trained model
+            metrics: Dictionary of evaluation metrics
+        """
+        try:
+            # Initialize and train model
+            self.model = RandomForestClassifier(
+                n_estimators=kwargs.get('n_estimators', 100),
+                max_depth=kwargs.get('max_depth', 10),
+                min_samples_split=kwargs.get('min_samples_split', 2),
+                min_samples_leaf=kwargs.get('min_samples_leaf', 1),
+                random_state=42,
+                n_jobs=-1
+            )
+
+            self.model.fit(X_train, y_train)
+
+            # Make predictions
+            y_pred = self.model.predict(X_test)
+
+            # Calculate metrics
+            metrics = {
+                'Accuracy': accuracy_score(y_test, y_pred),
+                'Precision': precision_score(y_test, y_pred, average='weighted'),
+                'Recall': recall_score(y_test, y_pred, average='weighted'),
+                'F1 Score': f1_score(y_test, y_pred, average='weighted')
+            }
+
+            return self.model, metrics
+
+        except Exception as e:
+            raise Exception(f"Error in model training: {str(e)}")
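Continuing the sketch above, the split produced by `DataProcessor` feeds straight into `ModelTrainer.train_model`; the hyperparameters shown are arbitrary examples forwarded through `**kwargs`:

```python
# Illustrative only: train a Random Forest on the split produced by DataProcessor above.
from model_training import ModelTrainer

trainer = ModelTrainer()
model, metrics = trainer.train_model(
    X_train, X_test, y_train, y_test,
    n_estimators=200,
    max_depth=8,
)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")
```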
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+[project]
+name = "repl-nix-workspace"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.11"
+dependencies = [
+    "dask[dataframe]>=2024.12.1",
+    "datasets>=3.2.0",
+    "huggingface-hub>=0.27.1",
+    "joblib>=1.4.2",
+    "matplotlib>=3.10.0",
+    "numpy>=2.2.1",
+    "pandas>=2.2.3",
+    "scikit-learn>=1.6.1",
+    "seaborn>=0.13.2",
+    "streamlit>=1.41.1",
+]
replit.nix ADDED
@@ -0,0 +1,16 @@
+{pkgs}: {
+  deps = [
+    pkgs.tk
+    pkgs.tcl
+    pkgs.qhull
+    pkgs.pkg-config
+    pkgs.gtk3
+    pkgs.gobject-introspection
+    pkgs.ghostscript
+    pkgs.freetype
+    pkgs.ffmpeg-full
+    pkgs.cairo
+    pkgs.arrow-cpp
+    pkgs.glibcLocales
+  ];
+}
replit_zip_error_log.txt ADDED
@@ -0,0 +1,83 @@
1
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/python-3.11","time":"2025-01-14T00:13:02Z"}
2
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/replit","time":"2025-01-14T00:13:02Z"}
3
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohappyeyeballs/aiohappyeyeballs-2.4.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
4
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohttp/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
5
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiosignal/aiosignal-1.3.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
6
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/altair/altair-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
7
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/attrs/attrs-24.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
8
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/blinker/blinker-1.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
9
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cachetools/cachetools-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
10
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/certifi/certifi-2024.12.14-py3-none-any","time":"2025-01-14T00:13:36Z"}
11
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/charset-normalizer/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
12
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/click/click-8.1.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
13
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cloudpickle/cloudpickle-3.1.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
14
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/contourpy/contourpy-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
15
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cycler/cycler-0.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
16
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask/dask-2024.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
17
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask-expr/dask_expr-1.1.21-py3-none-any","time":"2025-01-14T00:13:36Z"}
18
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/datasets/datasets-3.2.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
19
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dill/dill-0.3.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
20
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/filelock/filelock-3.16.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
21
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fonttools/fonttools-4.55.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
22
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/frozenlist/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
23
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.12.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
24
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
25
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitdb/gitdb-4.0.12-py3-none-any","time":"2025-01-14T00:13:36Z"}
26
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitpython/gitpython-3.1.44-py3-none-any","time":"2025-01-14T00:13:36Z"}
27
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/huggingface-hub/huggingface_hub-0.27.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
28
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/idna/idna-3.10-py3-none-any","time":"2025-01-14T00:13:36Z"}
29
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/importlib-metadata/importlib_metadata-8.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
30
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jinja2/jinja2-3.1.5-py3-none-any","time":"2025-01-14T00:13:36Z"}
31
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/joblib/joblib-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
32
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema/jsonschema-4.23.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
33
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema-specifications/jsonschema_specifications-2024.10.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
34
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/kiwisolver/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
35
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/locket/locket-1.0.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
36
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markdown-it-py/markdown_it_py-3.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
37
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markupsafe/markupsafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
38
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/matplotlib/matplotlib-3.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
39
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/mdurl/mdurl-0.1.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
40
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multidict/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
41
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multiprocess/multiprocess-0.70.16-py311-none-any","time":"2025-01-14T00:13:36Z"}
42
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/narwhals/narwhals-1.22.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
43
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/numpy/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
44
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/packaging/packaging-24.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
45
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pandas/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
46
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/partd/partd-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
47
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pillow/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
48
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/propcache/propcache-0.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
49
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/protobuf/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
50
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyarrow/pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
51
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pydeck/pydeck-0.9.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
52
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pygments/pygments-2.19.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
53
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyparsing/pyparsing-3.2.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
54
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/python-dateutil/python_dateutil-2.9.0.post0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
55
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pytz/pytz-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
56
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyyaml/pyyaml-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
57
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/referencing/referencing-0.35.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
58
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/requests/requests-2.32.3-py3-none-any","time":"2025-01-14T00:13:36Z"}
59
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rich/rich-13.9.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
60
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rpds-py/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
61
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scikit-learn/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
62
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scipy/scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
63
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/seaborn/seaborn-0.13.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
64
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/six/six-1.17.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
65
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/smmap/smmap-5.0.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
66
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/streamlit/streamlit-1.41.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
67
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tenacity/tenacity-9.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
68
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/threadpoolctl/threadpoolctl-3.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
69
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toml/toml-0.10.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
70
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toolz/toolz-1.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
71
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tornado/tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
72
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tqdm/tqdm-4.67.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
73
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/typing-extensions/typing_extensions-4.12.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
74
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tzdata/tzdata-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
75
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/urllib3/urllib3-2.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
76
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/watchdog/watchdog-6.0.0-py3-none-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
77
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/xxhash/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
78
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/yarl/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
79
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/zipp/zipp-3.21.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
80
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python","time":"2025-01-14T00:13:36Z"}
81
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3","time":"2025-01-14T00:13:36Z"}
82
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3.11","time":"2025-01-14T00:13:36Z"}
83
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/lib64","time":"2025-01-14T00:14:12Z"}
utils.py ADDED
@@ -0,0 +1,149 @@
+import pandas as pd
+import numpy as np
+import joblib
+import os
+import json
+from datetime import datetime
+
+def load_data(file):
+    """
+    Load data from uploaded file.
+
+    Args:
+        file: Streamlit uploaded file object
+
+    Returns:
+        pandas DataFrame
+    """
+    try:
+        if file.name.endswith('.csv'):
+            df = pd.read_csv(file)
+        elif file.name.endswith('.json'):
+            df = pd.read_json(file)
+        else:
+            raise ValueError("Unsupported file format")
+
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading data: {str(e)}")
+
+def get_feature_names(df):
+    """
+    Get list of numeric columns suitable for features.
+
+    Args:
+        df: pandas DataFrame
+
+    Returns:
+        list of column names
+    """
+    try:
+        # Select numeric columns
+        numeric_cols = df.select_dtypes(
+            include=['int64', 'float64']
+        ).columns.tolist()
+
+        return numeric_cols
+    except Exception as e:
+        raise Exception(f"Error getting feature names: {str(e)}")
+
+def save_model(model, feature_cols, preprocessing_params, metrics, model_name=None):
+    """
+    Save trained model and its metadata.
+
+    Args:
+        model: Trained sklearn model
+        feature_cols: List of feature column names
+        preprocessing_params: Dictionary of preprocessing parameters
+        metrics: Dictionary of model performance metrics
+        model_name: Optional custom name for the model
+
+    Returns:
+        saved_path: Path where model was saved
+    """
+    try:
+        # Create models directory if it doesn't exist
+        os.makedirs('models', exist_ok=True)
+
+        # Generate model name if not provided
+        if model_name is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            model_name = f"model_{timestamp}"
+
+        # Save paths
+        model_path = f"models/{model_name}.joblib"
+        metadata_path = f"models/{model_name}_metadata.json"
+
+        # Save model using joblib
+        joblib.dump(model, model_path)
+
+        # Save metadata
+        metadata = {
+            'feature_columns': feature_cols,
+            'preprocessing_parameters': preprocessing_params,
+            'performance_metrics': metrics,
+            'created_at': datetime.now().isoformat(),
+            'model_type': type(model).__name__
+        }
+
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=4)
+
+        return model_path, metadata_path
+
+    except Exception as e:
+        raise Exception(f"Error saving model: {str(e)}")
+
+def load_saved_model(model_path, metadata_path):
+    """
+    Load a saved model and its metadata.
+
+    Args:
+        model_path: Path to the saved model file
+        metadata_path: Path to the model metadata file
+
+    Returns:
+        model: Loaded model
+        metadata: Dictionary containing model metadata
+    """
+    try:
+        # Load model
+        model = joblib.load(model_path)
+
+        # Load metadata
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+
+        return model, metadata
+
+    except Exception as e:
+        raise Exception(f"Error loading model: {str(e)}")
+
+def list_saved_models():
+    """
+    List all saved models in the models directory.
+
+    Returns:
+        list of dictionaries containing model info
+    """
+    try:
+        models_info = []
+        if not os.path.exists('models'):
+            return models_info
+
+        for filename in os.listdir('models'):
+            if filename.endswith('_metadata.json'):
+                with open(f"models/{filename}", 'r') as f:
+                    metadata = json.load(f)
+                model_name = filename.replace('_metadata.json', '')
+                models_info.append({
+                    'name': model_name,
+                    'type': metadata['model_type'],
+                    'created_at': metadata['created_at'],
+                    'metrics': metadata['performance_metrics']
+                })
+
+        return models_info
+
+    except Exception as e:
+        raise Exception(f"Error listing models: {str(e)}")
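A short sketch of the save/load round trip these helpers provide; the model and metrics come from the training sketch above, and the model name is arbitrary:

```python
# Illustrative only: persist the trained model, then reload it and list what is on disk.
from utils import save_model, load_saved_model, list_saved_models

model_path, metadata_path = save_model(
    model,
    feature_cols=["packet_size", "duration"],
    preprocessing_params={"scaling_method": "standard"},
    metrics=metrics,
    model_name="demo_rf",  # omit to get a timestamped name instead
)

reloaded_model, metadata = load_saved_model(model_path, metadata_path)
print(metadata["model_type"], metadata["performance_metrics"])

for info in list_saved_models():
    print(info["name"], info["created_at"])
```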
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
visualizations.py ADDED
@@ -0,0 +1,93 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_curve, auc
+import numpy as np
+
+class Visualizer:
+    def __init__(self):
+        # The bare 'seaborn' style name was removed in newer Matplotlib releases
+        plt.style.use('seaborn-v0_8')
+
+    def plot_feature_importance(self, model, feature_names):
+        """Plot feature importance from the trained model."""
+        try:
+            plt.figure(figsize=(10, 6))
+            importances = model.feature_importances_
+            indices = np.argsort(importances)[::-1]
+
+            plt.title("Feature Importance")
+            plt.bar(range(len(importances)), importances[indices])
+            plt.xticks(
+                range(len(importances)),
+                [feature_names[i] for i in indices],
+                rotation=45,
+                ha='right'
+            )
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting feature importance: {str(e)}")
+
+    def plot_confusion_matrix(self, y_true, y_pred):
+        """Plot confusion matrix."""
+        try:
+            plt.figure(figsize=(8, 6))
+            cm = confusion_matrix(y_true, y_pred)
+            sns.heatmap(
+                cm,
+                annot=True,
+                fmt='d',
+                cmap='Blues',
+                cbar=False
+            )
+            plt.title("Confusion Matrix")
+            plt.ylabel("True Label")
+            plt.xlabel("Predicted Label")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting confusion matrix: {str(e)}")
+
+    def plot_roc_curve(self, model, X_test, y_test):
+        """Plot ROC curve."""
+        try:
+            plt.figure(figsize=(8, 6))
+            y_prob = model.predict_proba(X_test)
+
+            # Handle multi-class case
+            if y_prob.shape[1] > 2:
+                # Plot ROC curve for each class
+                for i in range(y_prob.shape[1]):
+                    fpr, tpr, _ = roc_curve(
+                        (y_test == i).astype(int),
+                        y_prob[:, i]
+                    )
+                    auc_score = auc(fpr, tpr)
+                    plt.plot(
+                        fpr,
+                        tpr,
+                        label=f'Class {i} (AUC = {auc_score:.2f})'
+                    )
+            else:
+                # Binary classification
+                fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
+                auc_score = auc(fpr, tpr)
+                plt.plot(
+                    fpr,
+                    tpr,
+                    label=f'ROC curve (AUC = {auc_score:.2f})'
+                )
+
+            plt.plot([0, 1], [0, 1], 'k--')
+            plt.xlim([0.0, 1.0])
+            plt.ylim([0.0, 1.05])
+            plt.xlabel('False Positive Rate')
+            plt.ylabel('True Positive Rate')
+            plt.title('Receiver Operating Characteristic (ROC) Curve')
+            plt.legend(loc="lower right")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting ROC curve: {str(e)}")
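Finally, a sketch of how the `Visualizer` figures are produced from the same trained model; in `app.py` the returned figures go to `st.pyplot`, while here they are simply written to disk:

```python
# Illustrative only: render the three evaluation plots to image files.
from visualizations import Visualizer

viz = Visualizer()

fig_imp = viz.plot_feature_importance(model, ["packet_size", "duration"])
fig_imp.savefig("feature_importance.png")

y_pred = model.predict(X_test)
fig_cm = viz.plot_confusion_matrix(y_test, y_pred)
fig_cm.savefig("confusion_matrix.png")

fig_roc = viz.plot_roc_curve(model, X_test, y_test)
fig_roc.savefig("roc_curve.png")
```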