Canstralian committed on
Commit 7316b09 · verified · 1 Parent(s): c7d95a9

Upload 16 files

Files changed (16)
  1. .gitignore +43 -0
  2. .replit +39 -0
  3. CODE_OF_CONDUCT.md +52 -0
  4. CONTRIBUTING.md +62 -0
  5. LICENSE +21 -0
  6. README.md +36 -31
  7. app.py +152 -23
  8. data_processing.py +129 -0
  9. generated-icon.png +0 -0
  10. model_training.py +48 -0
  11. pyproject.toml +17 -0
  12. replit.nix +16 -0
  13. replit_zip_error_log.txt +83 -0
  14. utils.py +149 -0
  15. uv.lock +0 -0
  16. visualizations.py +93 -0
.gitignore ADDED
@@ -0,0 +1,43 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Model files
+models/
+
+# Logs
+*.log
+
+# System
+.DS_Store
+Thumbs.db
.replit ADDED
@@ -0,0 +1,39 @@
+modules = ["python-3.11"]
+
+[nix]
+channel = "stable-24_05"
+
+[deployment]
+deploymentTarget = "autoscale"
+run = ["sh", "-c", "streamlit run app.py --server.port 5000"]
+
+[workflows]
+runButton = "Project"
+
+[[workflows.workflow]]
+name = "Project"
+mode = "parallel"
+author = "agent"
+
+[[workflows.workflow.tasks]]
+task = "workflow.run"
+args = "Streamlit App"
+
+[[workflows.workflow]]
+name = "Streamlit App"
+author = "agent"
+
+[workflows.workflow.metadata]
+agentRequireRestartOnSave = false
+
+[[workflows.workflow.tasks]]
+task = "packager.installForAll"
+
+[[workflows.workflow.tasks]]
+task = "shell.exec"
+args = "streamlit run app.py --server.port 5000"
+waitForPort = 5000
+
+[[ports]]
+localPort = 5000
+externalPort = 80
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,52 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior:
+
+* The use of sexualized language or imagery, and sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information without explicit permission
+* Other conduct which could reasonably be considered inappropriate
+
+## Enforcement Responsibilities
+
+Project maintainers are responsible for clarifying and enforcing standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the project team. All complaints will be reviewed and investigated
+promptly and fairly.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
CONTRIBUTING.md ADDED
@@ -0,0 +1,62 @@
+# Contributing to ML Pipeline for Cybersecurity Purple Teaming
+
+First off, thank you for considering contributing to our project! 🎉
+
+## Code of Conduct
+
+This project and everyone participating in it is governed by our [Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
+
+## How Can I Contribute?
+
+### Reporting Bugs 🐛
+
+- Use the GitHub issue tracker
+- Check if the bug has already been reported
+- Include detailed steps to reproduce the bug
+- Provide system information and stack traces if applicable
+
+### Suggesting Enhancements 💡
+
+- First, read the documentation to make sure the functionality doesn't already exist
+- Use the GitHub issue tracker and clearly describe the feature
+- Explain why this enhancement would be useful
+- Keep the scope as narrow as possible
+
+### Pull Requests 🔧
+
+1. Fork the repo and create your branch from `main`
+2. If you've added code that should be tested, add tests
+3. Ensure the test suite passes
+4. Make sure your code lints
+5. Issue that pull request!
+
+## Development Process
+
+1. **Setup Development Environment**
+   ```bash
+   pip install -r requirements-dev.txt
+   ```
+
+2. **Run Tests**
+   ```bash
+   pytest
+   ```
+
+3. **Code Style**
+   - Follow PEP 8 guidelines
+   - Use meaningful variable names
+   - Add comments for complex logic
+   - Write docstrings for functions and classes
+
+4. **Commit Messages**
+   - Use clear, descriptive commit messages
+   - Reference issues and pull requests
+   - Keep commits atomic and focused
+
+## Documentation 📚
+
+- Update README.md with details of changes to the interface
+- Update docstrings and comments
+- Add any new installation requirements
+
+Thank you for your contribution! 🙏
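The contributing guide assumes a pytest-based test suite (and a `requirements-dev.txt` that is not part of this upload). As a rough sketch of what such a test could look like for the helpers added in this commit, assuming a hypothetical `test_utils.py` at the repo root:

```python
# Illustrative only: a minimal pytest module exercising utils.get_feature_names.
import pandas as pd

from utils import get_feature_names


def test_get_feature_names_returns_numeric_columns_only():
    df = pd.DataFrame({
        "bytes_sent": [10, 20, 30],         # numeric -> should be returned
        "protocol": ["tcp", "udp", "tcp"],  # non-numeric -> should be ignored
    })
    assert get_feature_names(df) == ["bytes_sent"]
```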
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Cybersecurity ML Pipeline Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,20 +1,21 @@
----
-title: ML Pipeline for Cybersecurity Purple Teaming 🛡️
-emoji: 🏃
-colorFrom: indigo
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.41.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: A Streamlit-based machine learning pipeline platform
----
+---
+title: ML Pipeline for Cybersecurity Purple Teaming
+emoji: 🛡️
+colorFrom: red
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.28.1
+app_file: app.py
+pinned: false
+license: mit
+---
 
 # ML Pipeline for Cybersecurity Purple Teaming 🛡️
 
 A scalable Streamlit-based machine learning pipeline platform specialized for cybersecurity purple-teaming, enabling advanced data processing and model training.
 
+[![Open In Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+
 ## Features 🚀
 
 - **Distributed Data Processing**: Leverage Dask for handling large-scale datasets
@@ -32,21 +33,11 @@ A scalable Streamlit-based machine learning pipeline platform specialized for cy
 
 ## Getting Started 🏁
 
-1. **Clone the repository**
-```bash
-git clone https://github.com/yourusername/cybersec-ml-pipeline.git
-cd cybersec-ml-pipeline
-```
-
-2. **Install dependencies**
-```bash
-pip install -r requirements.txt
-```
-
-3. **Run the application**
-```bash
-streamlit run app.py
-```
-
+1. Visit the [Space on Hugging Face Hub](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+2. Upload your cybersecurity dataset (CSV/JSON format)
+3. Configure the ML pipeline parameters
+4. Train and evaluate your model
+5. Export the trained model for deployment
 
 ## Usage Guide 📖
 
@@ -64,13 +55,27 @@
 - Real-time performance metrics
 - Visual model evaluation
 
-## Contributing 🤝
-
-Please read our [Contributing Guidelines](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
-
-## Security 🔒
-
-For security concerns, please review our [Security Policy](.github/SECURITY.md).
-
+## Local Development
+
+1. **Clone the repository**
+```bash
+git clone https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline
+cd cybersec-ml-pipeline
+```
+
+2. **Install dependencies**
+```bash
+pip install -r requirements.txt
+```
+
+3. **Run the application**
+```bash
+streamlit run app.py
+```
+
+## Contributing 🤝
+
+Please read our [Contributing Guidelines](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
+
 ## License 📄
 
app.py CHANGED
@@ -6,6 +6,12 @@ from model_training import ModelTrainer
 from visualizations import Visualizer
 from utils import load_data, get_feature_names, save_model, load_saved_model, list_saved_models
 import warnings
+import re
+from typing import Optional
+from datasets import load_dataset
+from huggingface_hub import list_datasets
+import traceback
+
 warnings.filterwarnings('ignore')
 
 st.set_page_config(
@@ -14,21 +20,99 @@
     layout="wide"
 )
 
+def validate_model_name(name: Optional[str]) -> str:
+    """Validate and sanitize model name"""
+    if not name:
+        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
+    sanitized = re.sub(r'[^\w\-]', '_', name)
+    return sanitized
+
+def load_hf_dataset(dataset_name: str, config_name: Optional[str] = None) -> pd.DataFrame:
+    """Load a dataset from Hugging Face and convert to pandas DataFrame"""
+    try:
+        if config_name:
+            dataset = load_dataset(dataset_name, config_name)
+        else:
+            dataset = load_dataset(dataset_name)
+
+        # Convert to pandas DataFrame (using first split, usually 'train')
+        split_name = list(dataset.keys())[0]
+        df = dataset[split_name].to_pandas()
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading dataset from Hugging Face: {str(e)}\n{traceback.format_exc()}")
+
 def main():
     st.title("🛡️ ML Pipeline for Cybersecurity Purple Teaming")
 
+    # Initialize default values for feature engineering
+    if 'poly_degree' not in st.session_state:
+        st.session_state.poly_degree = 2
+    if 'k_best_features' not in st.session_state:
+        st.session_state.k_best_features = 10
+    if 'n_components' not in st.session_state:
+        st.session_state.n_components = 0.95
+
     # Sidebar
     st.sidebar.header("Pipeline Configuration")
 
-    # File upload
-    uploaded_file = st.sidebar.file_uploader(
-        "Upload Dataset (CSV/JSON)",
-        type=['csv', 'json']
+    # Data Input Tabs
+    data_input_tab = st.radio(
+        "Choose Data Source",
+        ["Upload File", "Load from Hugging Face"]
    )
 
-    if uploaded_file is not None:
+    df = None
+
+    if data_input_tab == "Upload File":
+        uploaded_file = st.file_uploader(
+            "Upload Dataset (CSV/JSON)",
+            type=['csv', 'json']
+        )
+        if uploaded_file is not None:
+            try:
+                df = load_data(uploaded_file)
+            except Exception as e:
+                st.error(f"Error loading file: {str(e)}")
+    else:
+        # Hugging Face Dataset Loading
+        st.markdown("### Load Dataset from Hugging Face")
+        dataset_name = st.text_input(
+            "Dataset Name",
+            help="Enter the Hugging Face dataset name (e.g., 'username/dataset-name')"
+        )
+        config_name = st.text_input(
+            "Configuration Name (Optional)",
+            help="Enter the specific configuration name if the dataset has multiple configurations"
+        )
+
+        if dataset_name:
+            try:
+                with st.spinner("Loading dataset from Hugging Face..."):
+                    df = load_hf_dataset(
+                        dataset_name,
+                        config_name if config_name else None
+                    )
+                st.success(f"Successfully loaded dataset: {dataset_name}")
+            except Exception as e:
+                st.error(str(e))
+
+    if df is not None:
         try:
-            df = load_data(uploaded_file)
+            # Validate data
+            if df.empty:
+                st.error("The dataset contains no data.")
+                return
+
+            if df.shape[1] < 2:
+                st.error("Dataset must contain at least two columns (features and target).")
+                return
+
+            # Check for numeric columns
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            if len(numeric_cols) == 0:
+                st.error("Dataset must contain at least one numeric column for analysis.")
+                return
 
             # Initialize components
            processor = DataProcessor()
@@ -68,37 +152,66 @@
            st.subheader("Advanced Features")
            use_polynomial = st.checkbox("Use Polynomial Features")
            if use_polynomial:
-                poly_degree = st.slider("Polynomial Degree", 2, 5, 2)
+                st.session_state.poly_degree = st.slider("Polynomial Degree", 2, 5, st.session_state.poly_degree)
 
            use_feature_selection = st.checkbox("Use Feature Selection")
            if use_feature_selection:
-                k_best_features = st.slider("Number of Best Features", 5, 50, 10)
+                max_features = min(50, df.shape[1])  # Limit k_best_features to number of columns
+                st.session_state.k_best_features = st.slider(
+                    "Number of Best Features",
+                    2,  # Minimum 2 features required
+                    max_features,
+                    min(st.session_state.k_best_features, max_features),
+                    help="Select the number of most important features to use"
+                )
 
            with col4:
                use_pca = st.checkbox("Use PCA")
                if use_pca:
-                    n_components = st.slider("PCA Components (%)", 1, 100, 95) / 100.0
+                    st.session_state.n_components = st.slider(
+                        "PCA Components (%)",
+                        1, 100,
+                        int(st.session_state.n_components * 100),
+                        help="Percentage of variance to preserve"
+                    ) / 100.0
 
                add_cyber_features = st.checkbox("Add Cybersecurity Features")
 
+            numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
+            if not numeric_features:
+                st.error("No numeric features found in the dataset.")
+                return
+
            feature_cols = st.multiselect(
                "Select Features",
-                get_feature_names(df),
-                default=get_feature_names(df)
+                numeric_features,
+                default=numeric_features,
+                help="Select the features to use for training"
            )
+
+            if not feature_cols:
+                st.error("Please select at least one feature column")
+                return
+
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
            target_col = st.selectbox(
                "Select Target Column",
-                df.columns.tolist()
+                [col for col in categorical_cols if col not in feature_cols],
+                help="Select the target variable to predict"
            )
 
+            if target_col is None:
+                st.error("No suitable target column found. Target should be categorical.")
+                return
+
            # Create feature engineering config
            feature_engineering_config = {
                'use_polynomial': use_polynomial,
-                'poly_degree': poly_degree if use_polynomial else None,
+                'poly_degree': st.session_state.poly_degree if use_polynomial else None,
                'use_feature_selection': use_feature_selection,
-                'k_best_features': k_best_features if use_feature_selection else None,
+                'k_best_features': st.session_state.k_best_features if use_feature_selection else None,
                'use_pca': use_pca,
-                'n_components': n_components if use_pca else None,
+                'n_components': st.session_state.n_components if use_pca else None,
                'add_cyber_features': add_cyber_features
            }
 
@@ -164,16 +277,28 @@
            for metric, value in metrics.items():
                st.metric(metric, f"{value:.4f}")
 
-            # Add model export section
+            # Add model export section with improved validation
            st.subheader("Export Model")
-            model_name = st.text_input("Model Name (optional)")
+            model_name = st.text_input(
+                "Model Name (optional)",
+                help="Enter a name for your model (alphanumeric and underscores only)"
+            )
+
            if st.button("Save Model"):
                try:
+                    # Validate and sanitize model name
+                    sanitized_name = validate_model_name(model_name)
+
+                    if sanitized_name != model_name:
+                        st.warning(f"Model name was sanitized to: {sanitized_name}")
+
                    # Save model and metadata
                    preprocessing_params = {
                        'feature_engineering_config': feature_engineering_config,
                        'handling_strategy': handling_strategy,
-                        'scaling_method': scaling_method
+                        'scaling_method': scaling_method,
+                        'feature_columns': feature_cols,
+                        'target_column': target_col
                    }
 
                    model_path, metadata_path = save_model(
@@ -181,12 +306,13 @@
                        feature_cols,
                        preprocessing_params,
                        metrics,
-                        model_name
+                        sanitized_name
                    )
 
-                    st.success(f"Model saved successfully! Files:\n- {model_path}\n- {metadata_path}")
+                    st.success(f"Model saved successfully!\nFiles:\n- {model_path}\n- {metadata_path}")
                except Exception as e:
                    st.error(f"Error saving model: {str(e)}")
+                    st.error("Please ensure you have proper permissions and sufficient disk space.")
 
            with col8:
                if not use_pca:  # Skip feature importance for PCA
@@ -215,10 +341,13 @@
                    st.pyplot(fig_roc)
 
        except Exception as e:
-            st.error(f"Error: {str(e)}")
-
+            st.error(f"An error occurred: {str(e)}")
+            st.error("Please check your input data and try again.")
    else:
-        st.info("Please upload a dataset to begin.")
+        if data_input_tab == "Upload File":
+            st.info("Please upload a dataset to begin.")
+        else:
+            st.info("Please enter a Hugging Face dataset name to begin.")
 
    # Add Model Management Section
    st.header("5. Saved Models")
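The two helpers introduced above can be exercised outside Streamlit for quick sanity checks. A minimal sketch, assuming `datasets` and `pandas` are installed; the dataset id below is a placeholder, not something shipped with this commit:

```python
# Standalone sketch of the helpers this commit adds to app.py.
# "username/dataset-name" is a placeholder dataset id.
import re
import pandas as pd
from datasets import load_dataset


def validate_model_name(name):
    """Fall back to a timestamped name and replace anything outside [A-Za-z0-9_-] with '_'."""
    if not name:
        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
    return re.sub(r'[^\w\-]', '_', name)


def load_hf_dataframe(dataset_name, config_name=None):
    """Load a Hugging Face dataset and return its first split as a pandas DataFrame."""
    dataset = load_dataset(dataset_name, config_name) if config_name else load_dataset(dataset_name)
    first_split = list(dataset.keys())[0]  # usually 'train'
    return dataset[first_split].to_pandas()


print(validate_model_name("purple team model!"))  # -> purple_team_model_
df = load_hf_dataframe("username/dataset-name")   # placeholder dataset id
print(df.shape)
```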
data_processing.py ADDED
@@ -0,0 +1,129 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
+from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.decomposition import PCA
+import dask.dataframe as dd
+
+class DataProcessor:
+    def __init__(self):
+        self.scaler = None
+        self.imputer = None
+        self.poly_features = None
+        self.feature_selector = None
+        self.pca = None
+
+    def _get_scaler(self, method):
+        """Returns the appropriate scaler based on method."""
+        scalers = {
+            'standard': StandardScaler(),
+            'minmax': MinMaxScaler(),
+            'robust': RobustScaler()
+        }
+        return scalers.get(method, StandardScaler())
+
+    def _get_imputer(self, strategy):
+        """Returns the appropriate imputer based on strategy."""
+        return SimpleImputer(strategy=strategy)
+
+    def _engineer_features(self, X, feature_engineering_config, y=None):
+        """Apply feature engineering transformations."""
+        # Polynomial Features
+        if feature_engineering_config.get('use_polynomial', False):
+            degree = feature_engineering_config.get('poly_degree', 2)
+            self.poly_features = PolynomialFeatures(degree=degree, include_bias=False)
+            X = self.poly_features.fit_transform(X)
+
+        # Feature Selection (f_classif scores features against the target, so y is required here)
+        if feature_engineering_config.get('use_feature_selection', False):
+            k = feature_engineering_config.get('k_best_features', 10)
+            self.feature_selector = SelectKBest(score_func=f_classif, k=k)
+            X = self.feature_selector.fit_transform(X, y)
+
+        # Dimensionality Reduction
+        if feature_engineering_config.get('use_pca', False):
+            n_components = feature_engineering_config.get('n_components', 0.95)
+            self.pca = PCA(n_components=n_components)
+            X = self.pca.fit_transform(X)
+
+        # Add cybersecurity-specific features
+        if feature_engineering_config.get('add_cyber_features', False):
+            X = self._add_cyber_features(X)
+
+        return X
+
+    def _add_cyber_features(self, X):
+        """Add cybersecurity-specific engineered features."""
+        # Convert back to DataFrame for feature engineering
+        X_df = pd.DataFrame(X)
+
+        # Example cyber features (modify based on your specific needs):
+        # - Entropy of numerical features
+        # - Statistical moments (skewness, kurtosis)
+        # - Rolling windows statistics
+
+        for col in X_df.columns:
+            if X_df[col].dtype in ['float64', 'int64']:
+                # Entropy-style score (-x * log2(x)) for positive values; 0 otherwise to avoid NaNs
+                X_df[f'{col}_entropy'] = X_df[col].apply(lambda x: -np.sum(x * np.log2(x)) if x > 0 else 0)
+
+                # Add statistical moments
+                X_df[f'{col}_skew'] = X_df[col].skew()
+                X_df[f'{col}_kurt'] = X_df[col].kurtosis()
+
+                # Add rolling statistics (fill the undefined first-window std with 0)
+                X_df[f'{col}_rolling_mean'] = X_df[col].rolling(window=3, min_periods=1).mean()
+                X_df[f'{col}_rolling_std'] = X_df[col].rolling(window=3, min_periods=1).std().fillna(0)
+
+        return X_df.values
+
+    def process_data(self, df, feature_cols, target_col, impute_strategy='mean',
+                     scaling_method='standard', feature_engineering_config=None):
+        """
+        Process the data using Dask for large datasets.
+
+        Args:
+            df: pandas DataFrame
+            feature_cols: list of feature columns
+            target_col: target column name
+            impute_strategy: strategy for handling missing values
+            scaling_method: method for scaling features
+            feature_engineering_config: dictionary of feature engineering parameters
+
+        Returns:
+            X_train, X_test, y_train, y_test: processed and split data
+        """
+        try:
+            # Convert to Dask DataFrame for large dataset handling
+            ddf = dd.from_pandas(df, npartitions=4)
+
+            # Select features and target
+            X = ddf[feature_cols].compute()
+            y = ddf[target_col].compute()
+
+            # Handle missing values
+            self.imputer = self._get_imputer(impute_strategy)
+            X = self.imputer.fit_transform(X)
+
+            # Scale features
+            self.scaler = self._get_scaler(scaling_method)
+            X = self.scaler.fit_transform(X)
+
+            # Apply feature engineering if config is provided
+            if feature_engineering_config:
+                X = self._engineer_features(X, feature_engineering_config, y)
+
+            # Split data
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
+                test_size=0.2,
+                random_state=42,
+                stratify=y if len(np.unique(y)) > 1 else None
+            )
+
+            return X_train, X_test, y_train, y_test
+
+        except Exception as e:
+            raise Exception(f"Error in data processing: {str(e)}")
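For orientation, a minimal usage sketch of `DataProcessor.process_data` on a small synthetic frame; the column names and values are made up for illustration:

```python
# Illustrative only: a tiny synthetic dataset run through DataProcessor.
import numpy as np
import pandas as pd
from data_processing import DataProcessor

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "packet_size": rng.integers(40, 1500, size=200),
    "duration": rng.random(200),
    "label": rng.integers(0, 2, size=200),  # binary target
})

processor = DataProcessor()
X_train, X_test, y_train, y_test = processor.process_data(
    df,
    feature_cols=["packet_size", "duration"],
    target_col="label",
    impute_strategy="mean",
    scaling_method="standard",
)
print(X_train.shape, X_test.shape)
```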
generated-icon.png ADDED
model_training.py ADDED
@@ -0,0 +1,48 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import numpy as np
+
+class ModelTrainer:
+    def __init__(self):
+        self.model = None
+
+    def train_model(self, X_train, X_test, y_train, y_test, **kwargs):
+        """
+        Train a Random Forest model with given parameters.
+
+        Args:
+            X_train, X_test, y_train, y_test: Training and test data
+            **kwargs: Model parameters
+
+        Returns:
+            model: Trained model
+            metrics: Dictionary of evaluation metrics
+        """
+        try:
+            # Initialize and train model
+            self.model = RandomForestClassifier(
+                n_estimators=kwargs.get('n_estimators', 100),
+                max_depth=kwargs.get('max_depth', 10),
+                min_samples_split=kwargs.get('min_samples_split', 2),
+                min_samples_leaf=kwargs.get('min_samples_leaf', 1),
+                random_state=42,
+                n_jobs=-1
+            )
+
+            self.model.fit(X_train, y_train)
+
+            # Make predictions
+            y_pred = self.model.predict(X_test)
+
+            # Calculate metrics
+            metrics = {
+                'Accuracy': accuracy_score(y_test, y_pred),
+                'Precision': precision_score(y_test, y_pred, average='weighted'),
+                'Recall': recall_score(y_test, y_pred, average='weighted'),
+                'F1 Score': f1_score(y_test, y_pred, average='weighted')
+            }
+
+            return self.model, metrics
+
+        except Exception as e:
+            raise Exception(f"Error in model training: {str(e)}")
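Continuing the sketch above, the split produced by `DataProcessor` feeds straight into `ModelTrainer.train_model`; the hyperparameters shown are arbitrary examples forwarded through `**kwargs`:

```python
# Illustrative only: train a Random Forest on the split produced by DataProcessor above.
from model_training import ModelTrainer

trainer = ModelTrainer()
model, metrics = trainer.train_model(
    X_train, X_test, y_train, y_test,
    n_estimators=200,
    max_depth=8,
)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")
```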
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+[project]
+name = "repl-nix-workspace"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.11"
+dependencies = [
+    "dask[dataframe]>=2024.12.1",
+    "datasets>=3.2.0",
+    "huggingface-hub>=0.27.1",
+    "joblib>=1.4.2",
+    "matplotlib>=3.10.0",
+    "numpy>=2.2.1",
+    "pandas>=2.2.3",
+    "scikit-learn>=1.6.1",
+    "seaborn>=0.13.2",
+    "streamlit>=1.41.1",
+]
replit.nix ADDED
@@ -0,0 +1,16 @@
+{pkgs}: {
+  deps = [
+    pkgs.tk
+    pkgs.tcl
+    pkgs.qhull
+    pkgs.pkg-config
+    pkgs.gtk3
+    pkgs.gobject-introspection
+    pkgs.ghostscript
+    pkgs.freetype
+    pkgs.ffmpeg-full
+    pkgs.cairo
+    pkgs.arrow-cpp
+    pkgs.glibcLocales
+  ];
+}
replit_zip_error_log.txt ADDED
@@ -0,0 +1,83 @@
1
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/python-3.11","time":"2025-01-14T00:13:02Z"}
2
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/replit","time":"2025-01-14T00:13:02Z"}
3
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohappyeyeballs/aiohappyeyeballs-2.4.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
4
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohttp/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
5
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiosignal/aiosignal-1.3.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
6
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/altair/altair-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
7
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/attrs/attrs-24.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
8
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/blinker/blinker-1.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
9
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cachetools/cachetools-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
10
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/certifi/certifi-2024.12.14-py3-none-any","time":"2025-01-14T00:13:36Z"}
11
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/charset-normalizer/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
12
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/click/click-8.1.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
13
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cloudpickle/cloudpickle-3.1.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
14
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/contourpy/contourpy-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
15
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cycler/cycler-0.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
16
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask/dask-2024.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
17
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask-expr/dask_expr-1.1.21-py3-none-any","time":"2025-01-14T00:13:36Z"}
18
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/datasets/datasets-3.2.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
19
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dill/dill-0.3.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
20
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/filelock/filelock-3.16.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
21
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fonttools/fonttools-4.55.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
22
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/frozenlist/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
23
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.12.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
24
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
25
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitdb/gitdb-4.0.12-py3-none-any","time":"2025-01-14T00:13:36Z"}
26
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitpython/gitpython-3.1.44-py3-none-any","time":"2025-01-14T00:13:36Z"}
27
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/huggingface-hub/huggingface_hub-0.27.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
28
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/idna/idna-3.10-py3-none-any","time":"2025-01-14T00:13:36Z"}
29
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/importlib-metadata/importlib_metadata-8.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
30
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jinja2/jinja2-3.1.5-py3-none-any","time":"2025-01-14T00:13:36Z"}
31
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/joblib/joblib-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
32
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema/jsonschema-4.23.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
33
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema-specifications/jsonschema_specifications-2024.10.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
34
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/kiwisolver/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
35
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/locket/locket-1.0.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
36
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markdown-it-py/markdown_it_py-3.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
37
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markupsafe/markupsafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
38
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/matplotlib/matplotlib-3.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
39
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/mdurl/mdurl-0.1.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
40
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multidict/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
41
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multiprocess/multiprocess-0.70.16-py311-none-any","time":"2025-01-14T00:13:36Z"}
42
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/narwhals/narwhals-1.22.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
43
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/numpy/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
44
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/packaging/packaging-24.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
45
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pandas/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
46
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/partd/partd-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
47
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pillow/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
48
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/propcache/propcache-0.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
49
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/protobuf/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
50
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyarrow/pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
51
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pydeck/pydeck-0.9.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
52
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pygments/pygments-2.19.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
53
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyparsing/pyparsing-3.2.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
54
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/python-dateutil/python_dateutil-2.9.0.post0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
55
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pytz/pytz-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
56
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyyaml/pyyaml-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
57
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/referencing/referencing-0.35.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
58
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/requests/requests-2.32.3-py3-none-any","time":"2025-01-14T00:13:36Z"}
59
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rich/rich-13.9.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
60
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rpds-py/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
61
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scikit-learn/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
62
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scipy/scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
63
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/seaborn/seaborn-0.13.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
64
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/six/six-1.17.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
65
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/smmap/smmap-5.0.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
66
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/streamlit/streamlit-1.41.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
67
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tenacity/tenacity-9.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
68
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/threadpoolctl/threadpoolctl-3.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
69
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toml/toml-0.10.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
70
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toolz/toolz-1.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
71
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tornado/tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
72
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tqdm/tqdm-4.67.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
73
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/typing-extensions/typing_extensions-4.12.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
74
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tzdata/tzdata-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
75
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/urllib3/urllib3-2.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
76
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/watchdog/watchdog-6.0.0-py3-none-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
77
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/xxhash/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
78
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/yarl/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
79
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/zipp/zipp-3.21.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
80
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python","time":"2025-01-14T00:13:36Z"}
81
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3","time":"2025-01-14T00:13:36Z"}
82
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3.11","time":"2025-01-14T00:13:36Z"}
83
+ {"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/lib64","time":"2025-01-14T00:14:12Z"}
utils.py ADDED
@@ -0,0 +1,149 @@
+import pandas as pd
+import numpy as np
+import joblib
+import os
+import json
+from datetime import datetime
+
+def load_data(file):
+    """
+    Load data from uploaded file.
+
+    Args:
+        file: Streamlit uploaded file object
+
+    Returns:
+        pandas DataFrame
+    """
+    try:
+        if file.name.endswith('.csv'):
+            df = pd.read_csv(file)
+        elif file.name.endswith('.json'):
+            df = pd.read_json(file)
+        else:
+            raise ValueError("Unsupported file format")
+
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading data: {str(e)}")
+
+def get_feature_names(df):
+    """
+    Get list of numeric columns suitable for features.
+
+    Args:
+        df: pandas DataFrame
+
+    Returns:
+        list of column names
+    """
+    try:
+        # Select numeric columns
+        numeric_cols = df.select_dtypes(
+            include=['int64', 'float64']
+        ).columns.tolist()
+
+        return numeric_cols
+    except Exception as e:
+        raise Exception(f"Error getting feature names: {str(e)}")
+
+def save_model(model, feature_cols, preprocessing_params, metrics, model_name=None):
+    """
+    Save trained model and its metadata.
+
+    Args:
+        model: Trained sklearn model
+        feature_cols: List of feature column names
+        preprocessing_params: Dictionary of preprocessing parameters
+        metrics: Dictionary of model performance metrics
+        model_name: Optional custom name for the model
+
+    Returns:
+        saved_path: Path where model was saved
+    """
+    try:
+        # Create models directory if it doesn't exist
+        os.makedirs('models', exist_ok=True)
+
+        # Generate model name if not provided
+        if model_name is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            model_name = f"model_{timestamp}"
+
+        # Save paths
+        model_path = f"models/{model_name}.joblib"
+        metadata_path = f"models/{model_name}_metadata.json"
+
+        # Save model using joblib
+        joblib.dump(model, model_path)
+
+        # Save metadata
+        metadata = {
+            'feature_columns': feature_cols,
+            'preprocessing_parameters': preprocessing_params,
+            'performance_metrics': metrics,
+            'created_at': datetime.now().isoformat(),
+            'model_type': type(model).__name__
+        }
+
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=4)
+
+        return model_path, metadata_path
+
+    except Exception as e:
+        raise Exception(f"Error saving model: {str(e)}")
+
+def load_saved_model(model_path, metadata_path):
+    """
+    Load a saved model and its metadata.
+
+    Args:
+        model_path: Path to the saved model file
+        metadata_path: Path to the model metadata file
+
+    Returns:
+        model: Loaded model
+        metadata: Dictionary containing model metadata
+    """
+    try:
+        # Load model
+        model = joblib.load(model_path)
+
+        # Load metadata
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+
+        return model, metadata
+
+    except Exception as e:
+        raise Exception(f"Error loading model: {str(e)}")
+
+def list_saved_models():
+    """
+    List all saved models in the models directory.
+
+    Returns:
+        list of dictionaries containing model info
+    """
+    try:
+        models_info = []
+        if not os.path.exists('models'):
+            return models_info
+
+        for filename in os.listdir('models'):
+            if filename.endswith('_metadata.json'):
+                with open(f"models/{filename}", 'r') as f:
+                    metadata = json.load(f)
+                model_name = filename.replace('_metadata.json', '')
+                models_info.append({
+                    'name': model_name,
+                    'type': metadata['model_type'],
+                    'created_at': metadata['created_at'],
+                    'metrics': metadata['performance_metrics']
+                })
+
+        return models_info
+
+    except Exception as e:
+        raise Exception(f"Error listing models: {str(e)}")
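A short sketch of the save/load round trip these helpers provide; the model and metrics come from the training sketch above, and the model name is arbitrary:

```python
# Illustrative only: persist the trained model, then reload it and list what is on disk.
from utils import save_model, load_saved_model, list_saved_models

model_path, metadata_path = save_model(
    model,
    feature_cols=["packet_size", "duration"],
    preprocessing_params={"scaling_method": "standard"},
    metrics=metrics,
    model_name="demo_rf",  # omit to get a timestamped name instead
)

reloaded_model, metadata = load_saved_model(model_path, metadata_path)
print(metadata["model_type"], metadata["performance_metrics"])

for info in list_saved_models():
    print(info["name"], info["created_at"])
```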
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
visualizations.py ADDED
@@ -0,0 +1,93 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_curve, auc
+import numpy as np
+
+class Visualizer:
+    def __init__(self):
+        # The bare 'seaborn' style name was removed in newer Matplotlib releases
+        plt.style.use('seaborn-v0_8')
+
+    def plot_feature_importance(self, model, feature_names):
+        """Plot feature importance from the trained model."""
+        try:
+            plt.figure(figsize=(10, 6))
+            importances = model.feature_importances_
+            indices = np.argsort(importances)[::-1]
+
+            plt.title("Feature Importance")
+            plt.bar(range(len(importances)), importances[indices])
+            plt.xticks(
+                range(len(importances)),
+                [feature_names[i] for i in indices],
+                rotation=45,
+                ha='right'
+            )
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting feature importance: {str(e)}")
+
+    def plot_confusion_matrix(self, y_true, y_pred):
+        """Plot confusion matrix."""
+        try:
+            plt.figure(figsize=(8, 6))
+            cm = confusion_matrix(y_true, y_pred)
+            sns.heatmap(
+                cm,
+                annot=True,
+                fmt='d',
+                cmap='Blues',
+                cbar=False
+            )
+            plt.title("Confusion Matrix")
+            plt.ylabel("True Label")
+            plt.xlabel("Predicted Label")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting confusion matrix: {str(e)}")
+
+    def plot_roc_curve(self, model, X_test, y_test):
+        """Plot ROC curve."""
+        try:
+            plt.figure(figsize=(8, 6))
+            y_prob = model.predict_proba(X_test)
+
+            # Handle multi-class case
+            if y_prob.shape[1] > 2:
+                # Plot ROC curve for each class
+                for i in range(y_prob.shape[1]):
+                    fpr, tpr, _ = roc_curve(
+                        (y_test == i).astype(int),
+                        y_prob[:, i]
+                    )
+                    auc_score = auc(fpr, tpr)
+                    plt.plot(
+                        fpr,
+                        tpr,
+                        label=f'Class {i} (AUC = {auc_score:.2f})'
+                    )
+            else:
+                # Binary classification
+                fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
+                auc_score = auc(fpr, tpr)
+                plt.plot(
+                    fpr,
+                    tpr,
+                    label=f'ROC curve (AUC = {auc_score:.2f})'
+                )
+
+            plt.plot([0, 1], [0, 1], 'k--')
+            plt.xlim([0.0, 1.0])
+            plt.ylim([0.0, 1.05])
+            plt.xlabel('False Positive Rate')
+            plt.ylabel('True Positive Rate')
+            plt.title('Receiver Operating Characteristic (ROC) Curve')
+            plt.legend(loc="lower right")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting ROC curve: {str(e)}")
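Finally, a sketch of how the `Visualizer` figures are produced from the same trained model; in `app.py` the returned figures go to `st.pyplot`, while here they are simply written to disk:

```python
# Illustrative only: render the three evaluation plots to image files.
from visualizations import Visualizer

viz = Visualizer()

fig_imp = viz.plot_feature_importance(model, ["packet_size", "duration"])
fig_imp.savefig("feature_importance.png")

y_pred = model.predict(X_test)
fig_cm = viz.plot_confusion_matrix(y_test, y_pred)
fig_cm.savefig("confusion_matrix.png")

fig_roc = viz.plot_roc_curve(model, X_test, y_test)
fig_roc.savefig("roc_curve.png")
```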