Upload 13 files
Browse files- .gitattributes +35 -35
- .gitignore +163 -0
- LICENSE +201 -0
- README.md +16 -12
- api.py +98 -0
- app.py +76 -0
- data_cleaning.py +206 -0
- model_building.py +41 -0
- model_load_save.py +13 -0
- requirements.txt +8 -0
- scaler.pkl +3 -0
- transformed_data.pkl +3 -0
- xgboost_model.pkl +3 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
hf_token.txt
|
6 |
+
|
7 |
+
# C extensions
|
8 |
+
*.so
|
9 |
+
|
10 |
+
# Distribution / packaging
|
11 |
+
.Python
|
12 |
+
build/
|
13 |
+
develop-eggs/
|
14 |
+
dist/
|
15 |
+
downloads/
|
16 |
+
eggs/
|
17 |
+
.eggs/
|
18 |
+
lib/
|
19 |
+
lib64/
|
20 |
+
parts/
|
21 |
+
sdist/
|
22 |
+
var/
|
23 |
+
wheels/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
cover/
|
54 |
+
|
55 |
+
# Translations
|
56 |
+
*.mo
|
57 |
+
*.pot
|
58 |
+
|
59 |
+
# Django stuff:
|
60 |
+
*.log
|
61 |
+
local_settings.py
|
62 |
+
db.sqlite3
|
63 |
+
db.sqlite3-journal
|
64 |
+
|
65 |
+
# Flask stuff:
|
66 |
+
instance/
|
67 |
+
.webassets-cache
|
68 |
+
|
69 |
+
# Scrapy stuff:
|
70 |
+
.scrapy
|
71 |
+
|
72 |
+
# Sphinx documentation
|
73 |
+
docs/_build/
|
74 |
+
|
75 |
+
# PyBuilder
|
76 |
+
.pybuilder/
|
77 |
+
target/
|
78 |
+
|
79 |
+
# Jupyter Notebook
|
80 |
+
.ipynb_checkpoints
|
81 |
+
|
82 |
+
# IPython
|
83 |
+
profile_default/
|
84 |
+
ipython_config.py
|
85 |
+
|
86 |
+
# pyenv
|
87 |
+
# For a library or package, you might want to ignore these files since the code is
|
88 |
+
# intended to run in multiple environments; otherwise, check them in:
|
89 |
+
# .python-version
|
90 |
+
|
91 |
+
# pipenv
|
92 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
93 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
94 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
95 |
+
# install all needed dependencies.
|
96 |
+
#Pipfile.lock
|
97 |
+
|
98 |
+
# poetry
|
99 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
100 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
101 |
+
# commonly ignored for libraries.
|
102 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
103 |
+
#poetry.lock
|
104 |
+
|
105 |
+
# pdm
|
106 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
107 |
+
#pdm.lock
|
108 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
109 |
+
# in version control.
|
110 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
111 |
+
.pdm.toml
|
112 |
+
.pdm-python
|
113 |
+
.pdm-build/
|
114 |
+
|
115 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
116 |
+
__pypackages__/
|
117 |
+
|
118 |
+
# Celery stuff
|
119 |
+
celerybeat-schedule
|
120 |
+
celerybeat.pid
|
121 |
+
|
122 |
+
# SageMath parsed files
|
123 |
+
*.sage.py
|
124 |
+
|
125 |
+
# Environments
|
126 |
+
.env
|
127 |
+
.venv
|
128 |
+
env/
|
129 |
+
venv/
|
130 |
+
ENV/
|
131 |
+
env.bak/
|
132 |
+
venv.bak/
|
133 |
+
|
134 |
+
# Spyder project settings
|
135 |
+
.spyderproject
|
136 |
+
.spyproject
|
137 |
+
|
138 |
+
# Rope project settings
|
139 |
+
.ropeproject
|
140 |
+
|
141 |
+
# mkdocs documentation
|
142 |
+
/site
|
143 |
+
|
144 |
+
# mypy
|
145 |
+
.mypy_cache/
|
146 |
+
.dmypy.json
|
147 |
+
dmypy.json
|
148 |
+
|
149 |
+
# Pyre type checker
|
150 |
+
.pyre/
|
151 |
+
|
152 |
+
# pytype static type analyzer
|
153 |
+
.pytype/
|
154 |
+
|
155 |
+
# Cython debug symbols
|
156 |
+
cython_debug/
|
157 |
+
|
158 |
+
# PyCharm
|
159 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
160 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
161 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
162 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
163 |
+
#.idea/
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
README.md
CHANGED
@@ -1,12 +1,16 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
---
title: Heart Disease Predictor
emoji: 🔥
colorFrom: yellow
colorTo: purple
sdk: static
pinned: false
license: apache-2.0
short_description: An end-to-end ML project
---

# ml-end-to-end-project

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
api.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from pydantic import BaseModel
|
3 |
+
from typing import List
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
from model_load_save import load_model
|
7 |
+
import dill
|
8 |
+
|
9 |
+
def load_preprocessing_components():
    """Load the pickled encoder and scaler used for preprocessing.

    Reads ``encoder.pkl`` and ``scaler.pkl`` (relative to the current
    working directory) and deserializes each one with ``dill``.

    Returns:
        A ``(encoder, scaler)`` tuple, in that order.
    """
    # NOTE: dill/pickle deserialization can execute arbitrary code, so these
    # files must come from a trusted source.
    loaded = []
    for artefact_path in ("encoder.pkl", "scaler.pkl"):
        with open(artefact_path, "rb") as handle:
            loaded.append(dill.load(handle))
    encoder, scaler = loaded
    return encoder, scaler
|
15 |
+
|
16 |
+
# ASGI application object that the route decorators below attach to.
app = FastAPI()

# Inference artefacts are loaded once, at import time, and shared by every
# request handler in this module.
model = load_model()
encoder, scaler = load_preprocessing_components()
|
21 |
+
|
22 |
+
# Define input schema
|
23 |
+
class InferenceData(BaseModel):
    """Request schema for one record sent to the prediction endpoints.

    The field names become the DataFrame column names when a request is
    converted with ``model_dump()``, so they must match the column names the
    preprocessing step looks up.
    """

    # Numeric inputs
    Age: float
    # Categorical inputs are received as plain strings
    Sex: str
    ChestPainType: str
    RestingBP: float
    Cholesterol: float
    FastingBS: int
    RestingECG: str
    MaxHR: float
    ExerciseAngina: str
    Oldpeak: float
    ST_Slope: str
|
35 |
+
|
36 |
+
|
37 |
+
# Health check endpoint
|
38 |
+
@app.get("/")
def read_root():
    """Health check: return a fixed message showing the API is reachable."""
    status_payload = {"message": "Inference API is up and running"}
    return status_payload
|
41 |
+
|
42 |
+
|
43 |
+
# Helper function for preprocessing
|
44 |
+
def preprocess_data(df: pd.DataFrame) -> np.ndarray:
    """Encode, select and scale raw input rows for the model.

    Uses the module-level ``encoder`` and ``scaler`` objects.

    Args:
        df: One row per record, with the raw (un-encoded) input columns.

    Returns:
        The output of ``scaler.transform`` on the six selected feature
        columns (presumably a 2-D NumPy array; this depends on the pickled
        scaler).
    """
    # Columns the encoder was fitted on; these are replaced by encoded columns.
    categorical_cols = encoder.feature_names_in_
    one_hot = pd.DataFrame(
        encoder.transform(df[categorical_cols]),
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )
    combined = pd.concat([df.drop(categorical_cols, axis=1), one_hot], axis=1)

    # Hard-coded feature subset, numeric columns first and encoded columns
    # second. The order is kept exactly because the scaler receives the
    # columns in this order.
    selected_columns = [
        'Oldpeak', 'MaxHR', 'Age',
        'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up',
    ]
    return scaler.transform(combined[selected_columns])
|
59 |
+
|
60 |
+
# Endpoint for single prediction
|
61 |
+
@app.post("/predict")
def predict(data: InferenceData):
    """Return the model's prediction for a single record.

    Args:
        data: The validated request body.

    Returns:
        ``{"prediction": <int>}`` holding the first element of the model
        output.

    Raises:
        HTTPException: Status 500 if preprocessing or prediction fails.
    """
    try:
        # Build a one-row frame so the single-record and batch paths share
        # the same preprocessing function.
        single_row = pd.DataFrame([data.model_dump()])
        features = preprocess_data(single_row)
        label = model.predict(features)[0]
        return {"prediction": int(label)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during prediction: {str(e)}")
|
78 |
+
|
79 |
+
|
80 |
+
# Endpoint for batch prediction
|
81 |
+
@app.post("/batch_predict")
def batch_predict(data: List[InferenceData]):
    """Return the model's prediction for every record in a batch.

    Args:
        data: The validated request body, a list of input records.

    Returns:
        ``{"predictions": [...]}`` where each entry is
        ``{"input": <record as dict>, "prediction": <int>}``, in request
        order. An empty request yields an empty list.

    Raises:
        HTTPException: Status 500 if preprocessing or prediction fails.
    """
    # An empty batch would build a DataFrame with no columns and fail inside
    # preprocessing with a misleading 500, so answer it directly.
    if not data:
        return {"predictions": []}

    try:
        # Dump each item once and reuse the dicts for both the DataFrame and
        # the response payload.
        records = [item.model_dump() for item in data]
        df = pd.DataFrame(records)

        processed_data = preprocess_data(df)
        predictions = model.predict(processed_data)

        results = [
            {"input": record, "prediction": int(pred)}
            for record, pred in zip(records, predictions)
        ]
        return {"predictions": results}

    except Exception as e:
        # Top-level boundary: convert any failure into an HTTP 500 and keep
        # the original exception chained for server-side debugging.
        raise HTTPException(status_code=500, detail=f"Error during batch prediction: {str(e)}") from e
|
app.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
# Set the FastAPI URL
|
6 |
+
API_URL = "http://127.0.0.1:8000"  # Replace with your FastAPI URL if different

# Seconds to wait for the API. Without a timeout, requests.post can block
# the Streamlit script indefinitely if the server never answers.
REQUEST_TIMEOUT_SECONDS = 10

# Define the user input form for prediction
st.title("Heart Disease Prediction")

st.subheader("Enter patient information below:")
age = st.number_input("Age", min_value=0, max_value=120, step=1)
sex = st.selectbox("Sex", ["M", "F"])
chest_pain_type = st.selectbox("Chest Pain Type", ["TA", "ATA", "NAP", "ASY"])
resting_bp = st.number_input("Resting Blood Pressure", min_value=0, max_value=300)
cholesterol = st.number_input("Cholesterol", min_value=0, max_value=600)
fasting_bs = st.selectbox("Fasting Blood Sugar", [0, 1])
resting_ecg = st.selectbox("Resting ECG", ["Normal", "ST", "LVH"])
max_hr = st.number_input("Maximum Heart Rate", min_value=0, max_value=220)
exercise_angina = st.selectbox("Exercise-Induced Angina", ["Y", "N"])
oldpeak = st.number_input("Oldpeak", min_value=0.0, max_value=10.0, step=0.1)
st_slope = st.selectbox("ST Slope", ["Up", "Flat", "Down"])

# Button to submit the form
if st.button("Predict"):
    # Keys must match the field names of the API's request schema.
    data = {
        "Age": age,
        "Sex": sex,
        "ChestPainType": chest_pain_type,
        "RestingBP": resting_bp,
        "Cholesterol": cholesterol,
        "FastingBS": fasting_bs,
        "RestingECG": resting_ecg,
        "MaxHR": max_hr,
        "ExerciseAngina": exercise_angina,
        "Oldpeak": oldpeak,
        "ST_Slope": st_slope,
    }

    # Send a request to the FastAPI server. Connection failures and timeouts
    # are treated like a non-200 answer so the user sees the error banner
    # instead of a raw traceback.
    try:
        response = requests.post(
            f"{API_URL}/predict", json=data, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException:
        response = None

    # Display the result
    if response is not None and response.status_code == 200:
        prediction = response.json()["prediction"]
        result = "Positive for heart disease" if prediction == 1 else "Negative for heart disease"
        st.success(f"Prediction: {result}")
    else:
        st.error("Error: Unable to get prediction from API. Please try again later.")
|
51 |
+
|
52 |
+
# Batch Prediction Section
|
53 |
+
st.subheader("Batch Prediction")
uploaded_file = st.file_uploader("Upload CSV for batch prediction", type="csv")

if uploaded_file:
    # Load the CSV file and echo it back so the user can check it.
    batch_frame = pd.read_csv(uploaded_file)
    st.write("Uploaded Data:")
    st.write(batch_frame)

    # The API expects a JSON list with one object per row.
    batch_data = batch_frame.to_dict(orient="records")

    if st.button("Predict Batch"):
        # Send batch data to the API. A timeout keeps the page from hanging
        # forever. RequestException covers connection errors, timeouts and
        # payloads that requests cannot serialize (e.g. NaN cells), so those
        # reach the error banner instead of a raw traceback.
        try:
            batch_response = requests.post(
                f"{API_URL}/batch_predict", json=batch_data, timeout=30
            )
        except requests.RequestException:
            batch_response = None

        # Display batch prediction results
        if batch_response is not None and batch_response.status_code == 200:
            predictions = batch_response.json()["predictions"]
            results_df = pd.DataFrame(predictions)
            st.write("Batch Prediction Results:")
            st.write(results_df)
        else:
            st.error("Error: Unable to get batch predictions from API. Please try again later.")
|
data_cleaning.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.pipeline import Pipeline
|
3 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
4 |
+
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.feature_selection import SelectKBest, chi2
|
9 |
+
import pandas as pd
|
10 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
11 |
+
from imblearn.over_sampling import SMOTE
|
12 |
+
import kagglehub
|
13 |
+
import pickle
|
14 |
+
|
15 |
+
|
16 |
+
# Encoder Class
|
17 |
+
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and label-encode the target.

    ``fit`` learns both encodings from a frame that contains the target
    column. ``transform`` replaces the categorical columns with their one-hot
    columns and, when the target column is present, appends its label-encoded
    values as the last column. A frame without the target column (e.g. rows
    to predict on) is transformed without a target instead of failing.

    :param categorical_columns: Names of the columns to one-hot encode
    :param target_column: Name of the label column
    """

    def __init__(self, categorical_columns, target_column):
        self.categorical_columns = categorical_columns
        self.target_column = target_column
        self.ohe = OneHotEncoder(sparse_output=False)
        self.le = LabelEncoder()
        self.encoded_feature_names = []  # Filled in by fit()

    def fit(self, X, y=None):
        """Fit both encoders on ``X`` and record the one-hot column names."""
        # list(...) so that a tuple of column names works as well as a list.
        categorical = list(self.categorical_columns)
        self.ohe.fit(X[categorical])
        self.le.fit(X[self.target_column])
        self.encoded_feature_names = self.ohe.get_feature_names_out(categorical).tolist()
        return self

    def transform(self, X):
        """Return ``X`` with categorical columns replaced by one-hot columns.

        The label-encoded target is appended only when ``X`` contains the
        target column.
        """
        categorical = list(self.categorical_columns)
        encoded_df = pd.DataFrame(
            self.ohe.transform(X[categorical]),
            columns=self.encoded_feature_names,
            index=X.index,
        )

        has_target = self.target_column in X.columns
        to_drop = categorical + ([self.target_column] if has_target else [])

        result = pd.concat([X.drop(columns=to_drop), encoded_df], axis=1)
        if has_target:
            # Re-attached last so the target ends up as the final column.
            result[self.target_column] = self.le.transform(X[self.target_column])
        return result
|
46 |
+
|
47 |
+
|
48 |
+
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the strongest numeric and one-hot features plus the target.

    Numeric features are ranked by absolute correlation with the target
    (``DataFrame.corrwith``); encoded features are ranked with a chi-squared
    ``SelectKBest``. Both ``fit`` and ``transform`` read the target column
    from the frame they are given.
    """

    def __init__(self, numeric_features, encoded_features, target_column, num_k=5, cat_k=5):
        """
        :param numeric_features: List of numeric feature names
        :param encoded_features: List of encoded feature names
        :param target_column: Target column name
        :param num_k: Number of top numeric features to select
        :param cat_k: Number of top encoded features to select
        """
        self.numeric_features = numeric_features
        self.encoded_features = encoded_features
        self.target_column = target_column
        self.num_k = num_k
        self.cat_k = cat_k
        self.chi2_selector = None      # fitted SelectKBest, set by fit()
        self.numeric_selector = None   # chosen numeric column names, set by fit()

    def fit(self, X, y=None):
        """Choose the numeric columns and fit the chi-squared selector.

        The ``y`` argument is ignored; the target is taken from ``X``.
        """
        target = X[self.target_column]

        # Rank numeric columns by |correlation| with the target.
        strength = X[self.numeric_features].corrwith(target).abs()
        self.numeric_selector = strength.nlargest(self.num_k).index.tolist()

        # Rank encoded columns with the chi-squared statistic.
        kbest = SelectKBest(chi2, k=self.cat_k)
        self.chi2_selector = kbest.fit(X[self.encoded_features], target)
        return self

    def transform(self, X):
        """Return the selected numeric columns, selected encoded columns and target."""
        kept_numeric = X[self.numeric_selector]
        target = X[self.target_column]
        kept_encoded = pd.DataFrame(
            self.chi2_selector.transform(X[self.encoded_features]),
            columns=self.chi2_selector.get_feature_names_out(),
            index=X.index,
        )
        return pd.concat([kept_numeric, kept_encoded, target], axis=1)
|
88 |
+
|
89 |
+
# Splitter Class
|
90 |
+
class Splitter(BaseEstimator, TransformerMixin):
    """Separate the target column and split the frame into train and test parts.

    :param target_column: Name of the label column to split off
    :param test_size: Passed to ``train_test_split`` as ``test_size``
    :param random_state: Passed to ``train_test_split`` as ``random_state``
    """

    def __init__(self, target_column, test_size=0.3, random_state=42):
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``(X_train, X_test, y_train, y_test)`` as a tuple."""
        labels = X[self.target_column]
        features = X.drop(columns=self.target_column)
        parts = train_test_split(
            features,
            labels,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return tuple(parts)
|
103 |
+
|
104 |
+
|
105 |
+
# Scaler Class
|
106 |
+
class Scaler(BaseEstimator, TransformerMixin):
    """Wrap a ``StandardScaler`` or ``MinMaxScaler`` for split or plain data.

    Accepts either a 4-tuple ``(X_train, X_test, y_train, y_test)`` or a
    single feature matrix.

    :param scaler_type: ``'standard'`` selects ``StandardScaler``; any other
        value selects ``MinMaxScaler``
    """

    def __init__(self, scaler_type='standard'):
        # Stored so BaseEstimator.get_params() / repr() can read it back.
        self.scaler_type = scaler_type
        self.scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()

    @staticmethod
    def _is_split(X):
        """Return True when ``X`` is an ``(X_train, X_test, y_train, y_test)`` tuple."""
        return isinstance(X, tuple) and len(X) == 4

    def _is_fitted(self):
        """Return True once the wrapped scaler has seen data."""
        # Both StandardScaler and MinMaxScaler set n_samples_seen_ in fit().
        return hasattr(self.scaler, "n_samples_seen_")

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X`` (on the training part of a split)."""
        self.scaler.fit(X[0] if self._is_split(X) else X)
        return self

    def transform(self, X):
        """Scale ``X``.

        A split tuple is always (re)fitted on its training part and the same
        statistics are applied to the test part; the labels pass through
        unchanged. A plain matrix is scaled with the already-fitted
        statistics, so new data is not re-fitted on itself; only when the
        scaler has never been fitted is it fitted on ``X`` first.
        """
        if self._is_split(X):
            X_train, X_test, y_train, y_test = X
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            return X_train_scaled, X_test_scaled, y_train, y_test

        if self._is_fitted():
            return self.scaler.transform(X)
        return self.scaler.fit_transform(X)
|
121 |
+
|
122 |
+
|
123 |
+
# Full pipeline with feature selection
|
124 |
+
# Full pipeline with feature selection
class FullPipeline:
    """Encode, select features, split and scale a raw data frame.

    The file previously defined this class twice; the second definition
    shadowed the first, so only its behaviour (which also pickles every
    stage) is kept here.

    :param categorical_columns: Columns handed to ``Encoder`` for one-hot encoding
    :param target_column: Name of the label column
    :param numeric_features: Numeric columns considered by ``FeatureSelector``
    :param num_k: Number of numeric features to keep
    :param cat_k: Number of encoded features to keep
    """

    def __init__(self, categorical_columns, target_column, numeric_features, num_k=5, cat_k=5):
        self.encoder = Encoder(categorical_columns, target_column)
        self.feature_selector = None  # Built after encoding, once encoded names exist
        self.splitter = Splitter(target_column)
        self.scaler = Scaler()
        self.numeric_features = numeric_features
        self.num_k = num_k
        self.cat_k = cat_k

    @staticmethod
    def _persist(obj, filename):
        """Pickle ``obj`` to ``filename``, closing the file even on error."""
        with open(filename, "wb") as fh:
            pickle.dump(obj, fh)

    def fit_transform(self, X):
        """Run all stages on ``X`` and return the scaled train/test split.

        Each stage object is written to the working directory after it has
        run: ``encoder.pkl``, ``feature_selector.pkl``, ``splitter.pkl`` and
        ``scaler.pkl``.

        :return: ``(X_train_scaled, X_test_scaled, y_train, y_test)``
        """
        X = self.encoder.fit_transform(X)
        self._persist(self.encoder, "encoder.pkl")

        # The selector needs the one-hot column names produced by the encoder.
        self.feature_selector = FeatureSelector(
            numeric_features=self.numeric_features,
            encoded_features=self.encoder.encoded_feature_names,
            target_column=self.encoder.target_column,
            num_k=self.num_k,
            cat_k=self.cat_k,
        )
        X = self.feature_selector.fit_transform(X)
        self._persist(self.feature_selector, "feature_selector.pkl")

        X_train, X_test, y_train, y_test = self.splitter.transform(X)
        self._persist(self.splitter, "splitter.pkl")

        X_train_scaled, X_test_scaled, y_train, y_test = self.scaler.transform(
            (X_train, X_test, y_train, y_test)
        )
        self._persist(self.scaler, "scaler.pkl")

        return (X_train_scaled, X_test_scaled, y_train, y_test)
|
181 |
+
|
182 |
+
|
183 |
+
def main():
    """Download the heart-failure dataset, run the pipeline and cache the split.

    Writes ``transformed_data.pkl`` containing
    ``(X_train, X_test, y_train, y_test)`` to the working directory.
    """
    import os  # local import: only needed to build the CSV path portably

    path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")
    # os.path.join instead of a hard-coded backslash, which only resolves on Windows.
    df = pd.read_csv(os.path.join(path, "heart.csv"))

    df.drop_duplicates(inplace=True)  # dropping the duplicates

    # defining the pipeline
    pipeline = FullPipeline(
        categorical_columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
        target_column='HeartDisease',
        numeric_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'],
        num_k=3,  # Select top 3 numeric features
        cat_k=3,  # Select top 3 categorical features
    )

    # transforming the data
    X_train, X_test, y_train, y_test = pipeline.fit_transform(df)

    with open("transformed_data.pkl", "wb") as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)


if __name__ == "__main__":
    main()
|
model_building.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import xgboost as xgb
|
2 |
+
from data_cleaning import main
|
3 |
+
from sklearn.metrics import classification_report
|
4 |
+
import pandas as pd
|
5 |
+
import dill
|
6 |
+
|
7 |
+
def load_data():
    """Read the cached split from ``transformed_data.pkl``.

    The file stores ``(X_train, X_test, y_train, y_test)``; the values are
    returned reordered as ``(X_train, y_train, X_test, y_test)``.
    """
    with open("transformed_data.pkl", "rb") as handle:
        splits = dill.load(handle)

    X_train, X_test, y_train, y_test = splits
    return X_train, y_train, X_test, y_test
|
12 |
+
|
13 |
+
|
14 |
+
def build_model(X_train, y_train, X_test, y_test):
    """Fit an ``XGBClassifier`` with fixed hyperparameters on the training data.

    ``X_test`` and ``y_test`` are accepted but not used here.

    :return: The fitted classifier
    """
    hyperparams = {
        "objective": "binary:logistic",
        "n_estimators": 500,
        'learning_rate': 0.0010812936756470217,
        'max_depth': 6,
        'subsample': 0.36482338465400405,
        'colsample_bytree': 0.17190210997311706,
        'min_child_weight': 15,
    }

    classifier = xgb.XGBClassifier(**hyperparams)
    classifier.fit(X_train, y_train, verbose=False)
    return classifier
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Train on the cached split and print a classification report for the test set.

    Note: this definition replaces the ``main`` imported from
    ``data_cleaning`` at the top of this module.
    """
    X_train, y_train, X_test, y_test = load_data()  # reading data
    classifier = build_model(X_train, y_train, X_test, y_test)  # building the model

    predicted = classifier.predict(X_test)
    print(classification_report(y_test, predicted))


if __name__ == "__main__":
    main()
|
model_load_save.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dill
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def save_model(model, path="model.pkl"):
    """Serialize ``model`` to ``path`` with dill.

    :param model: Object to serialize
    :param path: Destination file. The default is kept for backward
        compatibility; note that ``load_model`` in this module reads
        ``"xgboost_model.pkl"`` by default, so pass a matching ``path`` when
        the model should be loadable again without renaming the file.
    """
    with open(path, "wb") as f:
        dill.dump(model, f)
|
7 |
+
|
8 |
+
|
9 |
+
def load_model(path="xgboost_model.pkl"):
    """Deserialize and return a model from ``path`` with dill.

    :param path: File to read; defaults to the original hard-coded name.
    :return: The deserialized object
    """
    # NOTE: dill/pickle deserialization can execute arbitrary code; only load
    # files from a trusted source.
    with open(path, "rb") as f:
        return dill.load(f)
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
dill
|
5 |
+
streamlit
|
6 |
+
xgboost
|
7 |
+
requests
|
8 |
+
scikit-learn
|
scaler.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfcc5d384ca7bc517925e2ea1ae028e71a597e368a770c5d28d214b5b3f4fbdc
|
3 |
+
size 791
|
transformed_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e19ecb854e956dfc67a5e972229e02c6e5d0b01cb891532b2672b331329efbc6
|
3 |
+
size 67077
|
xgboost_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78b1669b1aee287e888c1b582d3a33c43a10f16ca634e3fd70a054e0fc0be3a9
|
3 |
+
size 392329
|