Canstralian
committed on
Upload 16 files
- .gitignore +43 -0
- .replit +39 -0
- CODE_OF_CONDUCT.md +52 -0
- CONTRIBUTING.md +62 -0
- LICENSE +21 -0
- README.md +36 -31
- app.py +152 -23
- data_processing.py +129 -0
- generated-icon.png +0 -0
- model_training.py +48 -0
- pyproject.toml +17 -0
- replit.nix +16 -0
- replit_zip_error_log.txt +83 -0
- utils.py +149 -0
- uv.lock +0 -0
- visualizations.py +93 -0
.gitignore
ADDED
@@ -0,0 +1,43 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Model files
+models/
+
+# Logs
+*.log
+
+# System
+.DS_Store
+Thumbs.db
.replit
ADDED
@@ -0,0 +1,39 @@
+modules = ["python-3.11"]
+
+[nix]
+channel = "stable-24_05"
+
+[deployment]
+deploymentTarget = "autoscale"
+run = ["sh", "-c", "streamlit run app.py --server.port 5000"]
+
+[workflows]
+runButton = "Project"
+
+[[workflows.workflow]]
+name = "Project"
+mode = "parallel"
+author = "agent"
+
+[[workflows.workflow.tasks]]
+task = "workflow.run"
+args = "Streamlit App"
+
+[[workflows.workflow]]
+name = "Streamlit App"
+author = "agent"
+
+[workflows.workflow.metadata]
+agentRequireRestartOnSave = false
+
+[[workflows.workflow.tasks]]
+task = "packager.installForAll"
+
+[[workflows.workflow.tasks]]
+task = "shell.exec"
+args = "streamlit run app.py --server.port 5000"
+waitForPort = 5000
+
+[[ports]]
+localPort = 5000
+externalPort = 80
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,52 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior:
+
+* The use of sexualized language or imagery, and sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information without explicit permission
+* Other conduct which could reasonably be considered inappropriate
+
+## Enforcement Responsibilities
+
+Project maintainers are responsible for clarifying and enforcing standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the project team. All complaints will be reviewed and investigated
+promptly and fairly.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
CONTRIBUTING.md
ADDED
@@ -0,0 +1,62 @@
+# Contributing to ML Pipeline for Cybersecurity Purple Teaming
+
+First off, thank you for considering contributing to our project! 🎉
+
+## Code of Conduct
+
+This project and everyone participating in it is governed by our [Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
+
+## How Can I Contribute?
+
+### Reporting Bugs 🐛
+
+- Use the GitHub issue tracker
+- Check if the bug has already been reported
+- Include detailed steps to reproduce the bug
+- Provide system information and stack traces if applicable
+
+### Suggesting Enhancements 💡
+
+- First, read the documentation to make sure the functionality doesn't already exist
+- Use the GitHub issue tracker and clearly describe the feature
+- Explain why this enhancement would be useful
+- Keep the scope as narrow as possible
+
+### Pull Requests 🔧
+
+1. Fork the repo and create your branch from `main`
+2. If you've added code that should be tested, add tests
+3. Ensure the test suite passes
+4. Make sure your code lints
+5. Issue that pull request!
+
+## Development Process
+
+1. **Setup Development Environment**
+   ```bash
+   pip install -r requirements-dev.txt
+   ```
+
+2. **Run Tests**
+   ```bash
+   pytest
+   ```
+
+3. **Code Style**
+   - Follow PEP 8 guidelines
+   - Use meaningful variable names
+   - Add comments for complex logic
+   - Write docstrings for functions and classes
+
+4. **Commit Messages**
+   - Use clear, descriptive commit messages
+   - Reference issues and pull requests
+   - Keep commits atomic and focused
+
+## Documentation 📚
+
+- Update README.md with details of changes to the interface
+- Update docstrings and comments
+- Add any new installation requirements
+
+Thank you for your contribution! 🙏
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Cybersecurity ML Pipeline Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,20 +1,21 @@
----
-title: ML Pipeline for Cybersecurity Purple Teaming
-emoji:
-colorFrom:
-colorTo:
-sdk: streamlit
-sdk_version: 1.
-app_file: app.py
-pinned: false
-license: mit
-
----
+---
+title: ML Pipeline for Cybersecurity Purple Teaming
+emoji: 🛡️
+colorFrom: red
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.28.1
+app_file: app.py
+pinned: false
+license: mit
+---
 
 # ML Pipeline for Cybersecurity Purple Teaming 🛡️
 
 A scalable Streamlit-based machine learning pipeline platform specialized for cybersecurity purple-teaming, enabling advanced data processing and model training.
 
+[![Open In Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+
 ## Features 🚀
 
 - **Distributed Data Processing**: Leverage Dask for handling large-scale datasets
@@ -32,21 +33,11 @@
 
 ## Getting Started 🏁
 
-1. …
-
-2. **Install dependencies**
-   ```bash
-   pip install -r requirements.txt
-   ```
-
-3. **Run the application**
-   ```bash
-   streamlit run app.py
-   ```
+1. Visit the [Space on Hugging Face Hub](https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline)
+2. Upload your cybersecurity dataset (CSV/JSON format)
+3. Configure the ML pipeline parameters
+4. Train and evaluate your model
+5. Export the trained model for deployment
 
 ## Usage Guide 📖
 
@@ -64,13 +55,27 @@
 - Real-time performance metrics
 - Visual model evaluation
 
-## …
+## Local Development
 
-…
+1. **Clone the repository**
+   ```bash
+   git clone https://huggingface.co/spaces/Canstralian/cybersec-ml-pipeline
+   cd cybersec-ml-pipeline
+   ```
+
+2. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Run the application**
+   ```bash
+   streamlit run app.py
+   ```
 
-## …
+## Contributing 🤝
 
-…
+Please read our [Contributing Guidelines](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
 
 ## License 📄
 
app.py
CHANGED
@@ -6,6 +6,12 @@
 from visualizations import Visualizer
 from utils import load_data, get_feature_names, save_model, load_saved_model, list_saved_models
 import warnings
+import re
+from typing import Optional
+from datasets import load_dataset
+from huggingface_hub import list_datasets
+import traceback
+
 warnings.filterwarnings('ignore')
 
 st.set_page_config(
@@ -14,21 +20,99 @@
     layout="wide"
 )
 
+def validate_model_name(name: Optional[str]) -> str:
+    """Validate and sanitize model name"""
+    if not name:
+        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
+    sanitized = re.sub(r'[^\w\-]', '_', name)
+    return sanitized
+
+def load_hf_dataset(dataset_name: str, config_name: Optional[str] = None) -> pd.DataFrame:
+    """Load a dataset from Hugging Face and convert to pandas DataFrame"""
+    try:
+        if config_name:
+            dataset = load_dataset(dataset_name, config_name)
+        else:
+            dataset = load_dataset(dataset_name)
+
+        # Convert to pandas DataFrame (using first split, usually 'train')
+        split_name = list(dataset.keys())[0]
+        df = dataset[split_name].to_pandas()
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading dataset from Hugging Face: {str(e)}\n{traceback.format_exc()}")
+
 def main():
     st.title("🛡️ ML Pipeline for Cybersecurity Purple Teaming")
 
+    # Initialize default values for feature engineering
+    if 'poly_degree' not in st.session_state:
+        st.session_state.poly_degree = 2
+    if 'k_best_features' not in st.session_state:
+        st.session_state.k_best_features = 10
+    if 'n_components' not in st.session_state:
+        st.session_state.n_components = 0.95
+
     # Sidebar
    st.sidebar.header("Pipeline Configuration")
 
-    # …
-    …
-    "…
-    …
+    # Data Input Tabs
+    data_input_tab = st.radio(
+        "Choose Data Source",
+        ["Upload File", "Load from Hugging Face"]
     )
 
-    …
+    df = None
+
+    if data_input_tab == "Upload File":
+        uploaded_file = st.file_uploader(
+            "Upload Dataset (CSV/JSON)",
+            type=['csv', 'json']
+        )
+        if uploaded_file is not None:
+            try:
+                df = load_data(uploaded_file)
+            except Exception as e:
+                st.error(f"Error loading file: {str(e)}")
+    else:
+        # Hugging Face Dataset Loading
+        st.markdown("### Load Dataset from Hugging Face")
+        dataset_name = st.text_input(
+            "Dataset Name",
+            help="Enter the Hugging Face dataset name (e.g., 'username/dataset-name')"
+        )
+        config_name = st.text_input(
+            "Configuration Name (Optional)",
+            help="Enter the specific configuration name if the dataset has multiple configurations"
+        )
+
+        if dataset_name:
+            try:
+                with st.spinner("Loading dataset from Hugging Face..."):
+                    df = load_hf_dataset(
+                        dataset_name,
+                        config_name if config_name else None
+                    )
+                st.success(f"Successfully loaded dataset: {dataset_name}")
+            except Exception as e:
+                st.error(str(e))
+
+    if df is not None:
         try:
-            …
+            # Validate data
+            if df.empty:
+                st.error("The dataset contains no data.")
+                return
+
+            if df.shape[1] < 2:
+                st.error("Dataset must contain at least two columns (features and target).")
+                return
+
+            # Check for numeric columns
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            if len(numeric_cols) == 0:
+                st.error("Dataset must contain at least one numeric column for analysis.")
+                return
 
             # Initialize components
             processor = DataProcessor()
@@ -68,37 +152,66 @@
             st.subheader("Advanced Features")
             use_polynomial = st.checkbox("Use Polynomial Features")
             if use_polynomial:
-                poly_degree = st.slider("Polynomial Degree", 2, 5, …)
+                st.session_state.poly_degree = st.slider("Polynomial Degree", 2, 5, st.session_state.poly_degree)
 
             use_feature_selection = st.checkbox("Use Feature Selection")
             if use_feature_selection:
-                …
+                max_features = min(50, df.shape[1])  # Limit k_best_features to number of columns
+                st.session_state.k_best_features = st.slider(
+                    "Number of Best Features",
+                    2,  # Minimum 2 features required
+                    max_features,
+                    min(st.session_state.k_best_features, max_features),
+                    help="Select the number of most important features to use"
+                )
 
             with col4:
                 use_pca = st.checkbox("Use PCA")
                 if use_pca:
-                    n_components = st.slider(…)
+                    st.session_state.n_components = st.slider(
+                        "PCA Components (%)",
+                        1, 100,
+                        int(st.session_state.n_components * 100),
+                        help="Percentage of variance to preserve"
+                    ) / 100.0
 
                 add_cyber_features = st.checkbox("Add Cybersecurity Features")
 
+            numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
+            if not numeric_features:
+                st.error("No numeric features found in the dataset.")
+                return
+
             feature_cols = st.multiselect(
                 "Select Features",
-                …,
-                default=…
+                numeric_features,
+                default=numeric_features,
+                help="Select the features to use for training"
             )
+
+            if not feature_cols:
+                st.error("Please select at least one feature column")
+                return
+
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
             target_col = st.selectbox(
                 "Select Target Column",
-                …
+                [col for col in categorical_cols if col not in feature_cols],
+                help="Select the target variable to predict"
            )
 
+            if target_col is None:
+                st.error("No suitable target column found. Target should be categorical.")
+                return
+
             # Create feature engineering config
             feature_engineering_config = {
                 'use_polynomial': use_polynomial,
-                'poly_degree': poly_degree if use_polynomial else None,
+                'poly_degree': st.session_state.poly_degree if use_polynomial else None,
                 'use_feature_selection': use_feature_selection,
-                'k_best_features': k_best_features if use_feature_selection else None,
+                'k_best_features': st.session_state.k_best_features if use_feature_selection else None,
                 'use_pca': use_pca,
-                'n_components': n_components if use_pca else None,
+                'n_components': st.session_state.n_components if use_pca else None,
                 'add_cyber_features': add_cyber_features
             }
 
@@ -164,16 +277,28 @@
             for metric, value in metrics.items():
                 st.metric(metric, f"{value:.4f}")
 
-            # Add model export section
+            # Add model export section with improved validation
             st.subheader("Export Model")
-            model_name = st.text_input(…)
+            model_name = st.text_input(
+                "Model Name (optional)",
+                help="Enter a name for your model (alphanumeric and underscores only)"
+            )
+
             if st.button("Save Model"):
                 try:
+                    # Validate and sanitize model name
+                    sanitized_name = validate_model_name(model_name)
+
+                    if sanitized_name != model_name:
+                        st.warning(f"Model name was sanitized to: {sanitized_name}")
+
                     # Save model and metadata
                     preprocessing_params = {
                         'feature_engineering_config': feature_engineering_config,
                         'handling_strategy': handling_strategy,
-                        'scaling_method': scaling_method
+                        'scaling_method': scaling_method,
+                        'feature_columns': feature_cols,
+                        'target_column': target_col
                     }
 
                     model_path, metadata_path = save_model(
@@ -181,12 +306,13 @@
                         feature_cols,
                         preprocessing_params,
                         metrics,
-                        …
+                        sanitized_name
                     )
 
-                    st.success(f"Model saved successfully…")
+                    st.success(f"Model saved successfully!\nFiles:\n- {model_path}\n- {metadata_path}")
                 except Exception as e:
                     st.error(f"Error saving model: {str(e)}")
+                    st.error("Please ensure you have proper permissions and sufficient disk space.")
 
             with col8:
                 if not use_pca:  # Skip feature importance for PCA
@@ -215,10 +341,13 @@
                 st.pyplot(fig_roc)
 
         except Exception as e:
-            st.error(f"…")
-
+            st.error(f"An error occurred: {str(e)}")
+            st.error("Please check your input data and try again.")
     else:
-        st.info(…)
+        if data_input_tab == "Upload File":
+            st.info("Please upload a dataset to begin.")
+        else:
+            st.info("Please enter a Hugging Face dataset name to begin.")
 
     # Add Model Management Section
     st.header("5. Saved Models")
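To see what the two new helpers do in isolation, here is a minimal standalone sketch; the logic is reproduced inline so it runs without importing the Streamlit app, and `imdb` is only an example of a public Hub dataset, not one this project uses.

```python
# Standalone sketch of the helpers added to app.py above (reproduced here
# so it runs without pulling in Streamlit). 'imdb' is a hypothetical
# example dataset; substitute any 'username/dataset-name' on the Hub.
import re
import pandas as pd
from datasets import load_dataset

def validate_model_name(name):
    # Empty names get a timestamped default; unsafe characters become '_'
    if not name:
        return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
    return re.sub(r'[^\w\-]', '_', name)

assert validate_model_name("my model!") == "my_model_"
assert validate_model_name(None).startswith("model_")

# Load a Hub dataset and take the first split (usually 'train') as a DataFrame
dataset = load_dataset("imdb")  # downloads on first call
df = dataset[list(dataset.keys())[0]].to_pandas()
print(df.shape)
```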
data_processing.py
ADDED
@@ -0,0 +1,129 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
+from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.decomposition import PCA
+import dask.dataframe as dd
+
+class DataProcessor:
+    def __init__(self):
+        self.scaler = None
+        self.imputer = None
+        self.poly_features = None
+        self.feature_selector = None
+        self.pca = None
+
+    def _get_scaler(self, method):
+        """Returns the appropriate scaler based on method."""
+        scalers = {
+            'standard': StandardScaler(),
+            'minmax': MinMaxScaler(),
+            'robust': RobustScaler()
+        }
+        return scalers.get(method, StandardScaler())
+
+    def _get_imputer(self, strategy):
+        """Returns the appropriate imputer based on strategy."""
+        return SimpleImputer(strategy=strategy)
+
+    def _engineer_features(self, X, y, feature_engineering_config):
+        """Apply feature engineering transformations."""
+        # Polynomial Features
+        if feature_engineering_config.get('use_polynomial', False):
+            degree = feature_engineering_config.get('poly_degree', 2)
+            self.poly_features = PolynomialFeatures(degree=degree, include_bias=False)
+            X = self.poly_features.fit_transform(X)
+
+        # Feature Selection (f_classif is supervised, so it needs the target labels)
+        if feature_engineering_config.get('use_feature_selection', False):
+            k = feature_engineering_config.get('k_best_features', 10)
+            self.feature_selector = SelectKBest(score_func=f_classif, k=k)
+            X = self.feature_selector.fit_transform(X, y)
+
+        # Dimensionality Reduction
+        if feature_engineering_config.get('use_pca', False):
+            n_components = feature_engineering_config.get('n_components', 0.95)
+            self.pca = PCA(n_components=n_components)
+            X = self.pca.fit_transform(X)
+
+        # Add cybersecurity-specific features
+        if feature_engineering_config.get('add_cyber_features', False):
+            X = self._add_cyber_features(X)
+
+        return X
+
+    def _add_cyber_features(self, X):
+        """Add cybersecurity-specific engineered features."""
+        # Convert back to DataFrame for feature engineering
+        X_df = pd.DataFrame(X)
+
+        # Example cyber features (modify based on your specific needs):
+        # - Per-value surprisal of numerical features
+        # - Statistical moments (skewness, kurtosis)
+        # - Rolling window statistics
+
+        for col in X_df.columns:
+            if X_df[col].dtype in ['float64', 'int64']:
+                # Per-value surprisal term; log2 is undefined for values <= 0,
+                # so guard against them
+                X_df[f'{col}_entropy'] = X_df[col].apply(lambda v: -v * np.log2(v) if v > 0 else 0.0)
+
+                # Add statistical moments
+                X_df[f'{col}_skew'] = X_df[col].skew()
+                X_df[f'{col}_kurt'] = X_df[col].kurtosis()
+
+                # Add rolling statistics (std of a single value is NaN, so fill it)
+                X_df[f'{col}_rolling_mean'] = X_df[col].rolling(window=3, min_periods=1).mean()
+                X_df[f'{col}_rolling_std'] = X_df[col].rolling(window=3, min_periods=1).std().fillna(0)
+
+        return X_df.values
+
+    def process_data(self, df, feature_cols, target_col, impute_strategy='mean',
+                     scaling_method='standard', feature_engineering_config=None):
+        """
+        Process the data using Dask for large datasets.
+
+        Args:
+            df: pandas DataFrame
+            feature_cols: list of feature columns
+            target_col: target column name
+            impute_strategy: strategy for handling missing values
+            scaling_method: method for scaling features
+            feature_engineering_config: dictionary of feature engineering parameters
+
+        Returns:
+            X_train, X_test, y_train, y_test: processed and split data
+        """
+        try:
+            # Convert to Dask DataFrame for large dataset handling
+            ddf = dd.from_pandas(df, npartitions=4)
+
+            # Select features and target
+            X = ddf[feature_cols].compute()
+            y = ddf[target_col].compute()
+
+            # Handle missing values
+            self.imputer = self._get_imputer(impute_strategy)
+            X = self.imputer.fit_transform(X)
+
+            # Scale features
+            self.scaler = self._get_scaler(scaling_method)
+            X = self.scaler.fit_transform(X)
+
+            # Apply feature engineering if config is provided
+            if feature_engineering_config:
+                X = self._engineer_features(X, y, feature_engineering_config)
+
+            # Split data
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
+                test_size=0.2,
+                random_state=42,
+                stratify=y if len(np.unique(y)) > 1 else None
+            )
+
+            return X_train, X_test, y_train, y_test
+
+        except Exception as e:
+            raise Exception(f"Error in data processing: {str(e)}")
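A short usage sketch for `DataProcessor.process_data` on synthetic flow-style data; the column names, sample size, and config values here are illustrative only, not part of the module.

```python
# Hypothetical end-to-end call into DataProcessor on synthetic data.
import numpy as np
import pandas as pd
from data_processing import DataProcessor

rng = np.random.default_rng(42)
df = pd.DataFrame({
    'bytes_in': rng.exponential(500, 200),   # illustrative traffic features
    'bytes_out': rng.exponential(300, 200),
    'duration': rng.gamma(2.0, 1.5, 200),
    'label': rng.choice(['benign', 'malicious'], 200),
})

processor = DataProcessor()
X_train, X_test, y_train, y_test = processor.process_data(
    df,
    feature_cols=['bytes_in', 'bytes_out', 'duration'],
    target_col='label',
    impute_strategy='mean',
    scaling_method='robust',
    feature_engineering_config={'use_polynomial': True, 'poly_degree': 2},
)
print(X_train.shape, X_test.shape)  # e.g. (160, 9) (40, 9) after degree-2 expansion
```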
generated-icon.png
ADDED
model_training.py
ADDED
@@ -0,0 +1,48 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import numpy as np
+
+class ModelTrainer:
+    def __init__(self):
+        self.model = None
+
+    def train_model(self, X_train, X_test, y_train, y_test, **kwargs):
+        """
+        Train a Random Forest model with given parameters.
+
+        Args:
+            X_train, X_test, y_train, y_test: Training and test data
+            **kwargs: Model parameters
+
+        Returns:
+            model: Trained model
+            metrics: Dictionary of evaluation metrics
+        """
+        try:
+            # Initialize and train model
+            self.model = RandomForestClassifier(
+                n_estimators=kwargs.get('n_estimators', 100),
+                max_depth=kwargs.get('max_depth', 10),
+                min_samples_split=kwargs.get('min_samples_split', 2),
+                min_samples_leaf=kwargs.get('min_samples_leaf', 1),
+                random_state=42,
+                n_jobs=-1
+            )
+
+            self.model.fit(X_train, y_train)
+
+            # Make predictions
+            y_pred = self.model.predict(X_test)
+
+            # Calculate metrics
+            metrics = {
+                'Accuracy': accuracy_score(y_test, y_pred),
+                'Precision': precision_score(y_test, y_pred, average='weighted'),
+                'Recall': recall_score(y_test, y_pred, average='weighted'),
+                'F1 Score': f1_score(y_test, y_pred, average='weighted')
+            }
+
+            return self.model, metrics
+
+        except Exception as e:
+            raise Exception(f"Error in model training: {str(e)}")
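And a matching sketch for `ModelTrainer`, using a synthetic split in place of `DataProcessor` output; the hyperparameter values are arbitrary examples.

```python
# Hypothetical training call on a synthetic split; in the app these arrays
# come from DataProcessor.process_data.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from model_training import ModelTrainer

X, y = make_classification(n_samples=400, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

trainer = ModelTrainer()
model, metrics = trainer.train_model(
    X_train, X_test, y_train, y_test,
    n_estimators=200,  # any of the supported RandomForest kwargs can be overridden
    max_depth=8,
)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")
```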
pyproject.toml
ADDED
@@ -0,0 +1,17 @@
+[project]
+name = "repl-nix-workspace"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.11"
+dependencies = [
+    "dask[dataframe]>=2024.12.1",
+    "datasets>=3.2.0",
+    "huggingface-hub>=0.27.1",
+    "joblib>=1.4.2",
+    "matplotlib>=3.10.0",
+    "numpy>=2.2.1",
+    "pandas>=2.2.3",
+    "scikit-learn>=1.6.1",
+    "seaborn>=0.13.2",
+    "streamlit>=1.41.1",
+]
replit.nix
ADDED
@@ -0,0 +1,16 @@
+{pkgs}: {
+  deps = [
+    pkgs.tk
+    pkgs.tcl
+    pkgs.qhull
+    pkgs.pkg-config
+    pkgs.gtk3
+    pkgs.gobject-introspection
+    pkgs.ghostscript
+    pkgs.freetype
+    pkgs.ffmpeg-full
+    pkgs.cairo
+    pkgs.arrow-cpp
+    pkgs.glibcLocales
+  ];
+}
replit_zip_error_log.txt
ADDED
@@ -0,0 +1,83 @@
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/python-3.11","time":"2025-01-14T00:13:02Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/replit/modules/replit","time":"2025-01-14T00:13:02Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohappyeyeballs/aiohappyeyeballs-2.4.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiohttp/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/aiosignal/aiosignal-1.3.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/altair/altair-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/attrs/attrs-24.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/blinker/blinker-1.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cachetools/cachetools-5.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/certifi/certifi-2024.12.14-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/charset-normalizer/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/click/click-8.1.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cloudpickle/cloudpickle-3.1.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/contourpy/contourpy-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/cycler/cycler-0.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask/dask-2024.12.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dask-expr/dask_expr-1.1.21-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/datasets/datasets-3.2.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/dill/dill-0.3.8-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/filelock/filelock-3.16.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fonttools/fonttools-4.55.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/frozenlist/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.12.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/fsspec/fsspec-2024.9.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitdb/gitdb-4.0.12-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/gitpython/gitpython-3.1.44-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/huggingface-hub/huggingface_hub-0.27.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/idna/idna-3.10-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/importlib-metadata/importlib_metadata-8.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jinja2/jinja2-3.1.5-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/joblib/joblib-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema/jsonschema-4.23.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/jsonschema-specifications/jsonschema_specifications-2024.10.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/kiwisolver/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/locket/locket-1.0.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markdown-it-py/markdown_it_py-3.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/markupsafe/markupsafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/matplotlib/matplotlib-3.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/mdurl/mdurl-0.1.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multidict/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/multiprocess/multiprocess-0.70.16-py311-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/narwhals/narwhals-1.22.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/numpy/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/packaging/packaging-24.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pandas/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/partd/partd-1.4.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pillow/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/propcache/propcache-0.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/protobuf/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyarrow/pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pydeck/pydeck-0.9.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pygments/pygments-2.19.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyparsing/pyparsing-3.2.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/python-dateutil/python_dateutil-2.9.0.post0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pytz/pytz-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/pyyaml/pyyaml-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/referencing/referencing-0.35.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/requests/requests-2.32.3-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rich/rich-13.9.4-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/rpds-py/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scikit-learn/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/scipy/scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/seaborn/seaborn-0.13.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/six/six-1.17.0-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/smmap/smmap-5.0.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/streamlit/streamlit-1.41.1-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tenacity/tenacity-9.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/threadpoolctl/threadpoolctl-3.5.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toml/toml-0.10.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/toolz/toolz-1.0.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tornado/tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tqdm/tqdm-4.67.1-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/typing-extensions/typing_extensions-4.12.2-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/tzdata/tzdata-2024.2-py2.py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/urllib3/urllib3-2.3.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/watchdog/watchdog-6.0.0-py3-none-manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/xxhash/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/yarl/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .cache/uv/wheels-v3/pypi/zipp/zipp-3.21.0-py3-none-any","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/bin/python3.11","time":"2025-01-14T00:13:36Z"}
+{"error":".zip archives do not support non-regular files","level":"error","msg":"unable to write file .pythonlibs/lib64","time":"2025-01-14T00:14:12Z"}
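Every entry above reports the same root cause: the paths under `.cache/` and `.pythonlibs/` are symlinks (non-regular files), which the zip exporter skips. As a rough diagnostic sketch (my reading of the log, not part of the commit), the offending paths can be listed before exporting:

```python
# List symlinks that a zip export would skip; the two roots are the ones
# named in the log above. Adjust for your workspace if needed.
import os

for root in ('.cache', '.pythonlibs'):
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames + dirnames:
            path = os.path.join(dirpath, name)
            if os.path.islink(path):
                print('non-regular:', path)
```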
utils.py
ADDED
@@ -0,0 +1,149 @@
+import pandas as pd
+import numpy as np
+import joblib
+import os
+import json
+from datetime import datetime
+
+def load_data(file):
+    """
+    Load data from uploaded file.
+
+    Args:
+        file: Streamlit uploaded file object
+
+    Returns:
+        pandas DataFrame
+    """
+    try:
+        if file.name.endswith('.csv'):
+            df = pd.read_csv(file)
+        elif file.name.endswith('.json'):
+            df = pd.read_json(file)
+        else:
+            raise ValueError("Unsupported file format")
+
+        return df
+    except Exception as e:
+        raise Exception(f"Error loading data: {str(e)}")
+
+def get_feature_names(df):
+    """
+    Get list of numeric columns suitable for features.
+
+    Args:
+        df: pandas DataFrame
+
+    Returns:
+        list of column names
+    """
+    try:
+        # Select numeric columns
+        numeric_cols = df.select_dtypes(
+            include=['int64', 'float64']
+        ).columns.tolist()
+
+        return numeric_cols
+    except Exception as e:
+        raise Exception(f"Error getting feature names: {str(e)}")
+
+def save_model(model, feature_cols, preprocessing_params, metrics, model_name=None):
+    """
+    Save trained model and its metadata.
+
+    Args:
+        model: Trained sklearn model
+        feature_cols: List of feature column names
+        preprocessing_params: Dictionary of preprocessing parameters
+        metrics: Dictionary of model performance metrics
+        model_name: Optional custom name for the model
+
+    Returns:
+        model_path, metadata_path: paths to the saved model and its metadata file
+    """
+    try:
+        # Create models directory if it doesn't exist
+        os.makedirs('models', exist_ok=True)
+
+        # Generate model name if not provided
+        if model_name is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            model_name = f"model_{timestamp}"
+
+        # Save paths
+        model_path = f"models/{model_name}.joblib"
+        metadata_path = f"models/{model_name}_metadata.json"
+
+        # Save model using joblib
+        joblib.dump(model, model_path)
+
+        # Save metadata
+        metadata = {
+            'feature_columns': feature_cols,
+            'preprocessing_parameters': preprocessing_params,
+            'performance_metrics': metrics,
+            'created_at': datetime.now().isoformat(),
+            'model_type': type(model).__name__
+        }
+
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=4)
+
+        return model_path, metadata_path
+
+    except Exception as e:
+        raise Exception(f"Error saving model: {str(e)}")
+
+def load_saved_model(model_path, metadata_path):
+    """
+    Load a saved model and its metadata.
+
+    Args:
+        model_path: Path to the saved model file
+        metadata_path: Path to the model metadata file
+
+    Returns:
+        model: Loaded model
+        metadata: Dictionary containing model metadata
+    """
+    try:
+        # Load model
+        model = joblib.load(model_path)
+
+        # Load metadata
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+
+        return model, metadata
+
+    except Exception as e:
+        raise Exception(f"Error loading model: {str(e)}")
+
+def list_saved_models():
+    """
+    List all saved models in the models directory.
+
+    Returns:
+        list of dictionaries containing model info
+    """
+    try:
+        models_info = []
+        if not os.path.exists('models'):
+            return models_info
+
+        for filename in os.listdir('models'):
+            if filename.endswith('_metadata.json'):
+                with open(f"models/{filename}", 'r') as f:
+                    metadata = json.load(f)
+                model_name = filename.replace('_metadata.json', '')
+                models_info.append({
+                    'name': model_name,
+                    'type': metadata['model_type'],
+                    'created_at': metadata['created_at'],
+                    'metrics': metadata['performance_metrics']
+                })
+
+        return models_info
+
+    except Exception as e:
+        raise Exception(f"Error listing models: {str(e)}")
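A save/load round-trip sketch for the persistence helpers above; the classifier, training data, and model name are placeholders.

```python
# Hypothetical save/load round trip using the helpers in utils.py.
from sklearn.ensemble import RandomForestClassifier
from utils import save_model, load_saved_model, list_saved_models

clf = RandomForestClassifier(random_state=42).fit([[0], [1]], [0, 1])  # toy model
model_path, metadata_path = save_model(
    clf,
    feature_cols=['x'],
    preprocessing_params={'scaling_method': 'standard'},
    metrics={'Accuracy': 1.0},
    model_name='demo_model',
)

model, metadata = load_saved_model(model_path, metadata_path)
print(metadata['model_type'])                 # 'RandomForestClassifier'
print([m['name'] for m in list_saved_models()])  # includes 'demo_model'
```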
uv.lock
ADDED
The diff for this file is too large to render.
visualizations.py
ADDED
@@ -0,0 +1,93 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_curve, auc
+import numpy as np
+
+class Visualizer:
+    def __init__(self):
+        # 'seaborn-v0_8' is the current name of the old 'seaborn' style,
+        # which was removed in Matplotlib 3.6+ (this project pins >=3.10)
+        plt.style.use('seaborn-v0_8')
+
+    def plot_feature_importance(self, model, feature_names):
+        """Plot feature importance from the trained model."""
+        try:
+            plt.figure(figsize=(10, 6))
+            importances = model.feature_importances_
+            indices = np.argsort(importances)[::-1]
+
+            plt.title("Feature Importance")
+            plt.bar(range(len(importances)), importances[indices])
+            plt.xticks(
+                range(len(importances)),
+                [feature_names[i] for i in indices],
+                rotation=45,
+                ha='right'
+            )
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting feature importance: {str(e)}")
+
+    def plot_confusion_matrix(self, y_true, y_pred):
+        """Plot confusion matrix."""
+        try:
+            plt.figure(figsize=(8, 6))
+            cm = confusion_matrix(y_true, y_pred)
+            sns.heatmap(
+                cm,
+                annot=True,
+                fmt='d',
+                cmap='Blues',
+                cbar=False
+            )
+            plt.title("Confusion Matrix")
+            plt.ylabel("True Label")
+            plt.xlabel("Predicted Label")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting confusion matrix: {str(e)}")
+
+    def plot_roc_curve(self, model, X_test, y_test):
+        """Plot ROC curve."""
+        try:
+            plt.figure(figsize=(8, 6))
+            y_prob = model.predict_proba(X_test)
+
+            # Handle multi-class case
+            if y_prob.shape[1] > 2:
+                # Plot ROC curve for each class
+                for i in range(y_prob.shape[1]):
+                    fpr, tpr, _ = roc_curve(
+                        (y_test == i).astype(int),
+                        y_prob[:, i]
+                    )
+                    auc_score = auc(fpr, tpr)
+                    plt.plot(
+                        fpr,
+                        tpr,
+                        label=f'Class {i} (AUC = {auc_score:.2f})'
+                    )
+            else:
+                # Binary classification
+                fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
+                auc_score = auc(fpr, tpr)
+                plt.plot(
+                    fpr,
+                    tpr,
+                    label=f'ROC curve (AUC = {auc_score:.2f})'
+                )
+
+            plt.plot([0, 1], [0, 1], 'k--')
+            plt.xlim([0.0, 1.0])
+            plt.ylim([0.0, 1.05])
+            plt.xlabel('False Positive Rate')
+            plt.ylabel('True Positive Rate')
+            plt.title('Receiver Operating Characteristic (ROC) Curve')
+            plt.legend(loc="lower right")
+            plt.tight_layout()
+            return plt.gcf()
+
+        except Exception as e:
+            raise Exception(f"Error plotting ROC curve: {str(e)}")
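Finally, a standalone sketch wiring `Visualizer` to a trained classifier; in the app these figures go to `st.pyplot`, so saving them to files here is just for illustration.

```python
# Hypothetical standalone use of Visualizer with a synthetic classifier.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from visualizations import Visualizer

X, y = make_classification(n_samples=300, n_features=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

viz = Visualizer()
fig = viz.plot_confusion_matrix(y_test, model.predict(X_test))
fig.savefig('confusion_matrix.png')

fig = viz.plot_roc_curve(model, X_test, y_test)  # binary case here
fig.savefig('roc_curve.png')

fig = viz.plot_feature_importance(model, [f'feat_{i}' for i in range(5)])
fig.savefig('feature_importance.png')
```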