Spaces:

zohaibterminator
/

heart-disease-predictor

Sleeping

App Files Files Community

zohaibterminator committed on Nov 9, 2024

Commit

8c10e4d

verified ·

1 Parent(s): 42e34e0

Upload 13 files

Browse files

Files changed (13) hide show

.gitattributes +35 -35
.gitignore +163 -0
LICENSE +201 -0
README.md +16 -12
api.py +98 -0
app.py +76 -0
data_cleaning.py +206 -0
model_building.py +41 -0
model_load_save.py +13 -0
requirements.txt +8 -0
scaler.pkl +3 -0
transformed_data.pkl +3 -0
xgboost_model.pkl +3 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,35 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,163 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+hf_token.txt
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,12 +1,16 @@
----
-title: Heart Disease Predictor
-emoji: 🔥
-colorFrom: yellow
-colorTo: purple
-sdk: static
-pinned: false
-license: apache-2.0
-short_description: An end-to-end ML project
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Heart Disease Predictor
+emoji: 🔥
+colorFrom: yellow
+colorTo: purple
+sdk: static
+pinned: false
+license: apache-2.0
+short_description: An end-to-end ML project
+---
+# ml-end-to-end-project
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

api.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+import pandas as pd
+import numpy as np
+from model_load_save import load_model
+import dill
+def load_preprocessing_components():
+    """Load the persisted encoder and scaler and return them as ``(encoder, scaler)``.
+    Reads ``encoder.pkl`` first and ``scaler.pkl`` second, both with ``dill``,
+    from the current working directory.
+    """
+    loaded = []
+    for filename in ("encoder.pkl", "scaler.pkl"):
+        with open(filename, "rb") as handle:
+            loaded.append(dill.load(handle))
+    encoder, scaler = loaded
+    return encoder, scaler
+# FastAPI application object; the route decorators below register on it
+app = FastAPI()
+# Load trained model
+# (done once at import time, then shared by every request handler)
+model = load_model()
+# Encoder and scaler are likewise loaded once and read as module globals by preprocess_data()
+encoder, scaler = load_preprocessing_components()
+# Define input schema
+class InferenceData(BaseModel):
+    # Request body describing one record to score.
+    # The prediction endpoints turn model_dump() of this object into a DataFrame,
+    # so these field names become the DataFrame column names verbatim.
+    Age: float
+    Sex: str
+    ChestPainType: str
+    RestingBP: float
+    Cholesterol: float
+    FastingBS: int
+    RestingECG: str
+    MaxHR: float
+    ExerciseAngina: str
+    Oldpeak: float
+    ST_Slope: str
+# Health check endpoint
+@app.get("/")
+def read_root():
+    # Liveness probe: a fixed payload showing the service answers requests
+    status = {"message": "Inference API is up and running"}
+    return status
+# Helper function for preprocessing
+def preprocess_data(df: pd.DataFrame) -> np.ndarray:
+    # Turn raw request rows into the scaled feature matrix passed to the model.
+    # Relies on the module-level ``encoder`` and ``scaler`` objects.
+    categorical_cols = encoder.feature_names_in_
+    # Encode the categorical columns, keeping the incoming row index
+    encoded_frame = pd.DataFrame(
+        encoder.transform(df[categorical_cols]),
+        columns=encoder.get_feature_names_out(),
+        index=df.index,
+    )
+    # Swap the raw categorical columns for their encoded counterparts
+    combined = pd.concat([df.drop(categorical_cols, axis=1), encoded_frame], axis=1)
+    # Keep only the hard-coded selected features: numeric ones first, encoded ones after
+    numeric_part = combined[['Oldpeak', 'MaxHR', 'Age']]
+    encoded_part = combined[['ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up']]
+    features = pd.concat([numeric_part, encoded_part], axis=1)
+    return scaler.transform(features)
+# Endpoint for single prediction
+@app.post("/predict")
+def predict(data: InferenceData):
+    # Score a single record and return its predicted class as a plain int.
+    try:
+        # One-row frame whose columns are the schema's field names
+        frame = pd.DataFrame([data.model_dump()])
+        features = preprocess_data(frame)
+        predicted = model.predict(features)
+        return {"prediction": int(predicted[0])}
+    except Exception as e:
+        # Any failure while preprocessing or predicting is reported as an HTTP 500
+        raise HTTPException(status_code=500, detail=f"Error during prediction: {str(e)}")
+# Endpoint for batch prediction
+@app.post("/batch_predict")
+def batch_predict(data: List[InferenceData]):
+    # Score a list of records; every result pairs the submitted input with its prediction.
+    # An empty batch has nothing to score. Without this guard the empty DataFrame
+    # has no columns, preprocessing raises, and the client gets a misleading 500.
+    if not data:
+        return {"predictions": []}
+    try:
+        # Serialise each item once and reuse the dicts for both the frame and the response
+        records = [item.model_dump() for item in data]
+        df = pd.DataFrame(records)
+        # Preprocess data
+        processed_data = preprocess_data(df)
+        # Make batch predictions
+        predictions = model.predict(processed_data)
+        # Format and return predictions
+        results = [{"input": record, "prediction": int(pred)} for record, pred in zip(records, predictions)]
+        return {"predictions": results}
+    except Exception as e:
+        # Chain the original exception so server logs keep the real traceback
+        raise HTTPException(status_code=500, detail=f"Error during batch prediction: {str(e)}") from e

app.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import streamlit as st
+import requests
+import pandas as pd
+# Set the FastAPI URL
+API_URL = "http://127.0.0.1:8000"  # Replace with your FastAPI URL if different
+# Upper bound (seconds) on each API call so the page cannot hang on an unresponsive server
+REQUEST_TIMEOUT = 10
+def post_to_api(endpoint, payload):
+    """POST payload as JSON to ``API_URL/endpoint``; return the response, or None if the request failed."""
+    try:
+        return requests.post(f"{API_URL}/{endpoint}", json=payload, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException:
+        # Connection refused, timeout, unserialisable payload, etc.: the caller shows the error banner
+        return None
+# Define the user input form for prediction
+st.title("Heart Disease Prediction")
+st.subheader("Enter patient information below:")
+age = st.number_input("Age", min_value=0, max_value=120, step=1)
+sex = st.selectbox("Sex", ["M", "F"])
+chest_pain_type = st.selectbox("Chest Pain Type", ["TA", "ATA", "NAP", "ASY"])
+resting_bp = st.number_input("Resting Blood Pressure", min_value=0, max_value=300)
+cholesterol = st.number_input("Cholesterol", min_value=0, max_value=600)
+fasting_bs = st.selectbox("Fasting Blood Sugar", [0, 1])
+resting_ecg = st.selectbox("Resting ECG", ["Normal", "ST", "LVH"])
+max_hr = st.number_input("Maximum Heart Rate", min_value=0, max_value=220)
+exercise_angina = st.selectbox("Exercise-Induced Angina", ["Y", "N"])
+oldpeak = st.number_input("Oldpeak", min_value=0.0, max_value=10.0, step=0.1)
+st_slope = st.selectbox("ST Slope", ["Up", "Flat", "Down"])
+# Button to submit the form
+if st.button("Predict"):
+    # Prepare the data payload; keys must match the API's InferenceData field names
+    data = {
+        "Age": age,
+        "Sex": sex,
+        "ChestPainType": chest_pain_type,
+        "RestingBP": resting_bp,
+        "Cholesterol": cholesterol,
+        "FastingBS": fasting_bs,
+        "RestingECG": resting_ecg,
+        "MaxHR": max_hr,
+        "ExerciseAngina": exercise_angina,
+        "Oldpeak": oldpeak,
+        "ST_Slope": st_slope
+    }
+    # Send a request to the FastAPI server
+    response = post_to_api("predict", data)
+    # Display the result (None means the request itself failed)
+    if response is not None and response.status_code == 200:
+        prediction = response.json()["prediction"]
+        result = "Positive for heart disease" if prediction == 1 else "Negative for heart disease"
+        st.success(f"Prediction: {result}")
+    else:
+        st.error("Error: Unable to get prediction from API. Please try again later.")
+# Batch Prediction Section
+st.subheader("Batch Prediction")
+uploaded_file = st.file_uploader("Upload CSV for batch prediction", type="csv")
+if uploaded_file:
+    # Load the CSV file
+    batch_data = pd.read_csv(uploaded_file)
+    st.write("Uploaded Data:")
+    st.write(batch_data)
+    # Prepare batch data for the API: one dict per CSV row
+    batch_data = batch_data.to_dict(orient="records")
+    if st.button("Predict Batch"):
+        # Send batch data to the API
+        batch_response = post_to_api("batch_predict", batch_data)
+        # Display batch prediction results (None means the request itself failed)
+        if batch_response is not None and batch_response.status_code == 200:
+            predictions = batch_response.json()["predictions"]
+            results_df = pd.DataFrame(predictions)
+            st.write("Batch Prediction Results:")
+            st.write(results_df)
+        else:
+            st.error("Error: Unable to get batch predictions from API. Please try again later.")

data_cleaning.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import pandas as pd
+from sklearn.pipeline import Pipeline
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import SelectKBest, chi2
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from imblearn.over_sampling import SMOTE
+import kagglehub
+import pickle
+# Encoder Class
+class Encoder(BaseEstimator, TransformerMixin):
+    """One-hot encode the categorical columns and label-encode the target column of a DataFrame."""
+    def __init__(self, categorical_columns, target_column):
+        self.categorical_columns = categorical_columns
+        self.target_column = target_column
+        self.ohe = OneHotEncoder(sparse_output=False)
+        self.le = LabelEncoder()
+        # Filled in by fit() with the names of the generated one-hot columns
+        self.encoded_feature_names = []
+    def fit(self, X, y=None):
+        """Fit both encoders on X, which must contain the target column; ``y`` is ignored."""
+        categorical_frame = X[self.categorical_columns]
+        self.ohe.fit(categorical_frame)
+        self.le.fit(X[self.target_column])
+        names = self.ohe.get_feature_names_out(self.categorical_columns)
+        self.encoded_feature_names = names.tolist()
+        return self
+    def transform(self, X):
+        """Return X with the categorical columns replaced by one-hot columns and the encoded target as last column."""
+        one_hot = pd.DataFrame(
+            self.ohe.transform(X[self.categorical_columns]),
+            columns=self.encoded_feature_names,
+            index=X.index,
+        )
+        to_drop = self.categorical_columns + [self.target_column]
+        remaining = X.drop(to_drop, axis=1)
+        result = pd.concat([remaining, one_hot], axis=1)
+        # The target is re-attached after the concat so it ends up as the final column
+        result[self.target_column] = self.le.transform(X[self.target_column])
+        return result
+class FeatureSelector(BaseEstimator, TransformerMixin):
+    """Keep the numeric features most correlated with the target and the encoded features with the best chi-squared score."""
+    def __init__(self, numeric_features, encoded_features, target_column, num_k=5, cat_k=5):
+        """
+        :param numeric_features: List of numeric feature names
+        :param encoded_features: List of encoded feature names
+        :param target_column: Target column name
+        :param num_k: Number of top numeric features to select
+        :param cat_k: Number of top encoded features to select
+        """
+        self.numeric_features = numeric_features
+        self.encoded_features = encoded_features
+        self.target_column = target_column
+        self.num_k = num_k
+        self.cat_k = cat_k
+        # Both selectors stay None until fit() has run
+        self.chi2_selector = None
+        self.numeric_selector = None
+    def fit(self, X, y=None):
+        """Learn which columns to keep; the target is read from X, so the ``y`` argument is ignored."""
+        target = X[self.target_column]
+        # Rank numeric features by absolute correlation with the target
+        correlations = X[self.numeric_features].corrwith(target).abs()
+        self.numeric_selector = correlations.nlargest(self.num_k).index.tolist()
+        # Rank encoded categorical features with the chi-squared test
+        selector = SelectKBest(chi2, k=self.cat_k)
+        self.chi2_selector = selector.fit(X[self.encoded_features], target)
+        return self
+    def transform(self, X):
+        """Return the selected numeric columns, the selected encoded columns and the target, in that order."""
+        numeric_part = X[self.numeric_selector]
+        kept = self.chi2_selector.transform(X[self.encoded_features])
+        categorical_part = pd.DataFrame(
+            kept,
+            columns=self.chi2_selector.get_feature_names_out(),
+            index=X.index,
+        )
+        return pd.concat([numeric_part, categorical_part, X[self.target_column]], axis=1)
+# Splitter Class
+class Splitter(BaseEstimator, TransformerMixin):
+    """Separate the target column from a DataFrame and split both into train and test parts."""
+    def __init__(self, target_column, test_size=0.3, random_state=42):
+        self.target_column = target_column
+        self.test_size = test_size
+        self.random_state = random_state
+    def fit(self, X, y=None):
+        """Nothing is learned; returns self."""
+        return self
+    def transform(self, X):
+        """Return the tuple ``(X_train, X_test, y_train, y_test)``."""
+        target = X[self.target_column]
+        features = X.drop(self.target_column, axis=1)
+        parts = train_test_split(
+            features,
+            target,
+            test_size=self.test_size,
+            random_state=self.random_state,
+        )
+        return tuple(parts)
+# Scaler Class
+class Scaler(BaseEstimator, TransformerMixin):
+    """Wrap a StandardScaler (default) or MinMaxScaler for use on a train/test split or on a single dataset.
+    Given the 4-tuple ``(X_train, X_test, y_train, y_test)``, transform() fits on the
+    training part only and applies the same statistics to the test part.
+    Given anything else, it reuses the fitted statistics when they exist and only
+    fits when the wrapped scaler has never been fitted.
+    """
+    def __init__(self, scaler_type='standard'):
+        # Stored so BaseEstimator.get_params()/repr() can read the constructor argument back
+        self.scaler_type = scaler_type
+        self.scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()
+    def fit(self, X, y=None):
+        """No-op kept for pipeline compatibility; the actual fitting happens in transform()."""
+        return self
+    def transform(self, X):
+        """Scale X; see the class docstring for the tuple and non-tuple behaviour."""
+        if isinstance(X, tuple) and len(X) == 4:
+            X_train, X_test, y_train, y_test = X
+            # Fit on the training data only so no test-set statistics leak into the scaler
+            X_train_scaled = self.scaler.fit_transform(X_train)
+            X_test_scaled = self.scaler.transform(X_test)
+            return X_train_scaled, X_test_scaled, y_train, y_test
+        # StandardScaler and MinMaxScaler both gain ``scale_`` when fitted. Re-fitting here
+        # would replace the training statistics with those of the inference input.
+        if hasattr(self.scaler, "scale_"):
+            return self.scaler.transform(X)
+        return self.scaler.fit_transform(X)
+# Full pipeline with feature selection
+class FullPipeline:
+    """Run encoding, feature selection, train/test splitting and scaling, persisting each fitted step.
+    fit_transform() writes ``encoder.pkl``, ``feature_selector.pkl``, ``splitter.pkl`` and
+    ``scaler.pkl`` to the current working directory.
+    """
+    def __init__(self, categorical_columns, target_column, numeric_features, num_k=5, cat_k=5):
+        self.encoder = Encoder(categorical_columns, target_column)
+        self.feature_selector = None  # Initialize after encoding to access encoded names
+        self.splitter = Splitter(target_column)
+        self.scaler = Scaler()
+        self.numeric_features = numeric_features
+        self.num_k = num_k
+        self.cat_k = cat_k
+    @staticmethod
+    def _save(obj, filename):
+        """Pickle obj to filename, closing the file even if pickling fails."""
+        with open(filename, "wb") as f:
+            pickle.dump(obj, f)
+    def fit_transform(self, X):
+        """Fit every step on X and return ``(X_train_scaled, X_test_scaled, y_train, y_test)``."""
+        X = self.encoder.fit_transform(X)
+        self._save(self.encoder, "encoder.pkl")
+        # The selector needs the one-hot column names, which exist only after the encoder is fitted
+        self.feature_selector = FeatureSelector(
+            numeric_features=self.numeric_features,
+            encoded_features=self.encoder.encoded_feature_names,
+            target_column=self.encoder.target_column,
+            num_k=self.num_k, cat_k=self.cat_k
+        )
+        X = self.feature_selector.fit_transform(X)
+        self._save(self.feature_selector, "feature_selector.pkl")
+        X_train, X_test, y_train, y_test = self.splitter.transform(X)
+        self._save(self.splitter, "splitter.pkl")
+        X_train_scaled, X_test_scaled, y_train, y_test = self.scaler.transform((X_train, X_test, y_train, y_test))
+        # Saved after scaling so the pickle holds the statistics fitted on the training split
+        self._save(self.scaler, "scaler.pkl")
+        return (X_train_scaled, X_test_scaled, y_train, y_test)
+def main():
+    """Download the heart-failure dataset, run the full pipeline and save the split to ``transformed_data.pkl``."""
+    import os  # local import: only needed to build the CSV path portably
+    path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")
+    # os.path.join picks the right separator; a hard-coded backslash only works on Windows
+    df = pd.read_csv(os.path.join(path, "heart.csv"))
+    df.drop_duplicates(inplace=True) # dropping the duplicates
+    # defining the pipeline
+    pipeline = FullPipeline(
+        categorical_columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
+        target_column='HeartDisease',
+        numeric_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'],
+        num_k=3,  # Select top 3 numeric features
+        cat_k=3   # Select top 3 categorical features
+    )
+    # transforming the data
+    X_train, X_test, y_train, y_test = pipeline.fit_transform(df)
+    with open("transformed_data.pkl", "wb") as f:
+        pickle.dump((X_train, X_test, y_train, y_test), f)
+if __name__ == "__main__":
+    main()

model_building.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import xgboost as xgb
+from data_cleaning import main
+from sklearn.metrics import classification_report
+import pandas as pd
+import dill
+def load_data():
+    """Read the pickled split from ``transformed_data.pkl``.
+    The file stores ``(X_train, X_test, y_train, y_test)``; the return value is
+    reordered to ``(X_train, y_train, X_test, y_test)``.
+    """
+    with open("transformed_data.pkl", "rb") as handle:
+        split = dill.load(handle)
+    X_train, X_test, y_train, y_test = split
+    return X_train, y_train, X_test, y_test
+def build_model(X_train, y_train, X_test, y_test):
+    """Fit an XGBoost binary classifier on the training data and return it.
+    ``X_test`` and ``y_test`` are accepted but not used here.
+    """
+    model = xgb.XGBClassifier(
+        objective="binary:logistic",
+        n_estimators=500,
+        learning_rate=0.0010812936756470217,
+        max_depth=6,
+        subsample=0.36482338465400405,
+        colsample_bytree=0.17190210997311706,
+        min_child_weight=15,
+    )
+    model.fit(X_train, y_train, verbose=False)
+    return model
+def main():
+    """Load the saved split, train the classifier and print a classification report for the test set."""
+    X_train, y_train, X_test, y_test = load_data()
+    classifier = build_model(X_train, y_train, X_test, y_test)
+    predicted = classifier.predict(X_test)
+    print(classification_report(y_test, predicted))
+if __name__=="__main__":
+    main()

model_load_save.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import dill
+import pandas as pd
+# Single default location for the model file, so save_model() and load_model() always agree
+MODEL_PATH = "xgboost_model.pkl"
+def save_model(model, path=MODEL_PATH):
+    """Serialise model with dill to path (default: the file load_model() reads)."""
+    with open(path, "wb") as f:
+        dill.dump(model, f)
+def load_model(path=MODEL_PATH):
+    """Deserialise and return the model stored at path.
+    NOTE: dill, like pickle, can execute arbitrary code while loading; only load trusted files.
+    """
+    with open(path, "rb") as f:
+        model = dill.load(f)
+    return model

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+fastapi
+pandas
+numpy
+dill
+streamlit
+xgboost
+requests
+scikit-learn

scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfcc5d384ca7bc517925e2ea1ae028e71a597e368a770c5d28d214b5b3f4fbdc
+size 791

transformed_data.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e19ecb854e956dfc67a5e972229e02c6e5d0b01cb891532b2672b331329efbc6
+size 67077

xgboost_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78b1669b1aee287e888c1b582d3a33c43a10f16ca634e3fd70a054e0fc0be3a9
+size 392329