Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
cbc_disease_model.joblib +3 -0
diagnosis.py +49 -0
diagnosis_app.py +102 -0
diagnosis_test.py +28 -0
disease_label_encoder.joblib +3 -0
expanded_integrated_flagged_cbc_dataset.xlsx +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+expanded_integrated_flagged_cbc_dataset.xlsx filter=lfs diff=lfs merge=lfs -text

cbc_disease_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2c1b8eb82fcd76ff5f3d55288ab2d485b4529138c94b94f99d315f1a2e4c23b
+size 94589665

diagnosis.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import classification_report, accuracy_score
+import joblib
+# Training script: fits a random-forest classifier that predicts the first
+# listed probable disease from CBC measurements, persists the model and its
+# label encoder, then prints hold-out accuracy and a per-class report.
+# --- Read your dataset ---
+df = pd.read_excel('expanded_integrated_flagged_cbc_dataset.xlsx')  # For a CSV export use pd.read_csv(...) instead
+# --- Preprocessing ---
+# Normalise headers so the names below match even with stray/inner spaces.
+df.columns = df.columns.str.strip().str.replace(' ', '_')
+# Any Gender value other than exactly 'Male'/'Female' becomes NaN here and is
+# then replaced by the column median a few lines below.
+df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
+feature_cols = [
+    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
+    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
+]
+# Non-numeric cells are coerced to NaN, then every gap is filled with the
+# median of its column.
+df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
+df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())
+# Use first probable disease; rows without one get the literal label 'None'.
+# astype(str) keeps the split from failing on non-string cells.
+df['Label'] = df['Probable_Diseases'].fillna('None').astype(str).apply(lambda x: x.split(',')[0].strip())
+label_encoder = LabelEncoder()
+df['Label_encoded'] = label_encoder.fit_transform(df['Label'])
+# --- Model Training & Evaluation ---
+X = df[feature_cols]
+y = df['Label_encoded']
+# Stratified 70/30 split keeps class proportions similar in both partitions.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42, stratify=y
+)
+clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
+clf.fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+# Save model and label encoder to file
+joblib.dump(clf, 'cbc_disease_model.joblib')
+joblib.dump(label_encoder, 'disease_label_encoder.joblib')
+print("✅ Model and label encoder saved successfully!")
+# --- Accuracy and Report ---
+print("Accuracy:", accuracy_score(y_test, y_pred))
+# Pass the full label list explicitly: without it classification_report infers
+# the labels from y_test/y_pred and raises ValueError when a rare class is
+# absent from both, because target_names would then be longer than the labels.
+all_labels = list(range(len(label_encoder.classes_)))
+print(classification_report(
+    y_test, y_pred, labels=all_labels, target_names=label_encoder.classes_, zero_division=0
+))

diagnosis_app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from pathlib import Path
+
+import joblib
+import numpy as np
+import pandas as pd
+import streamlit as st
+from PIL import Image
+# Streamlit front end: collects CBC values (typed in, or read from the first
+# row of an uploaded CSV/Excel file) and shows the most probable diseases
+# according to the persisted classifier.
+#
+# Load trained model and label encoder.
+# The artefacts are resolved relative to this script so the app does not
+# depend on a machine-specific absolute path or on the working directory.
+# NOTE: joblib files are pickles - only load artefacts from a trusted source.
+APP_DIR = Path(__file__).resolve().parent
+clf = joblib.load(APP_DIR / 'cbc_disease_model.joblib')
+label_encoder = joblib.load(APP_DIR / 'disease_label_encoder.joblib')
+# Column order handed to the classifier.
+FEATURE_ORDER = [
+    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
+    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
+]
+# Numeric codes used for Gender throughout this app (Male -> 1, Female -> 0).
+GENDER_CODES = {'Male': 1, 'Female': 0}
+# How many diseases to list in the ranking.
+TOP_N = 5
+
+
+def encode_gender(value):
+    """Return the numeric Gender code for 'Male'/'Female' text.
+
+    Values that are not recognised text (already numeric, or unknown strings)
+    are returned unchanged so later numeric validation can deal with them.
+    """
+    if isinstance(value, str):
+        return GENDER_CODES.get(value.strip().capitalize(), value)
+    return value
+
+
+def show_ranked_predictions(features):
+    """Render the TOP_N most probable diseases for a one-row feature frame.
+
+    Args:
+        features: single-row DataFrame whose columns follow FEATURE_ORDER.
+    """
+    probas = clf.predict_proba(features)[0]
+    top_idx = np.argsort(probas)[::-1][:TOP_N]
+    # predict_proba columns are ordered like clf.classes_ (the encoded labels),
+    # so decode through the encoder instead of assuming column i is class i.
+    names = label_encoder.inverse_transform(clf.classes_[top_idx])
+    st.subheader("🧾 Predicted Disease Rankings")
+    for rank, (disease, prob) in enumerate(zip(names, probas[top_idx]), 1):
+        st.write(f"{rank}. **{disease}** — {prob * 100:.2f}%")
+
+
+st.title("🩸 CBC Disease Prediction Web App")
+st.markdown("""
+Upload your CBC data or a blood report image to get a ranked prediction of possible diseases.
+""")
+option = st.radio("Input method:", ("Upload Image & Enter Data Manually", "Upload CBC as CSV/Excel File"))
+if option == "Upload Image & Enter Data Manually":
+    # The image is only displayed as a visual aid; values are typed in by hand.
+    image_file = st.file_uploader("Upload CBC blood report image (JPG, PNG)", type=["jpg", "jpeg", "png"])
+    if image_file:
+        image = Image.open(image_file)
+        st.image(image, caption="CBC Blood Report Image", use_column_width=True)
+        st.info("Review and copy values from the image into the entry form below.")
+    st.subheader("Enter Your Complete Blood Count (CBC) Values")
+    user_input = {}
+    # Demographics always at top
+    demog_col, main_col1, main_col2 = st.columns([1, 2, 2])
+    with demog_col:
+        # Always show gender and age at the top
+        gender_str = st.selectbox("Gender", ["Female", "Male"])
+        user_input['Gender'] = GENDER_CODES[gender_str]
+        user_input['Age'] = st.number_input("Age (years)", min_value=0, max_value=120, value=30)
+    # Other CBC fields split nicely in two columns
+    other_fields = [f for f in FEATURE_ORDER if f not in ["Gender", "Age"]]
+    half = len(other_fields) // 2
+    with main_col1:
+        for field in other_fields[:half]:
+            user_input[field] = st.number_input(f"{field}:", value=0.0, format="%.2f")
+    with main_col2:
+        for field in other_fields[half:]:
+            user_input[field] = st.number_input(f"{field}:", value=0.0, format="%.2f")
+    if st.button("Predict Disease"):
+        show_ranked_predictions(pd.DataFrame([user_input])[FEATURE_ORDER])
+elif option == "Upload CBC as CSV/Excel File":
+    data_file = st.file_uploader("Upload your CBC data file (.csv or .xlsx)", type=["csv", "xlsx"])
+    if data_file:
+        # Read uploaded file based on extension (case-insensitive, so
+        # "REPORT.CSV" is not mistaken for a workbook).
+        if data_file.name.lower().endswith(".csv"):
+            data = pd.read_csv(data_file)
+        else:
+            data = pd.read_excel(data_file)
+        # Tolerate stray whitespace around header names.
+        data.columns = data.columns.astype(str).str.strip()
+        st.write("Uploaded CBC Data:")
+        st.write(data)
+        # Check for missing Age/Gender
+        missing = [col for col in ('Age', 'Gender') if col not in data.columns]
+        # Prompt as needed
+        if missing:
+            st.warning(f"Uploaded file is missing: {', '.join(missing)}")
+            if 'Age' in missing:
+                age_value = st.number_input("Enter Age (years)", min_value=0, max_value=120, value=30, key='age_up')
+                data['Age'] = age_value
+            if 'Gender' in missing:
+                gender_choice = st.selectbox("Select Gender", ["Female", "Male"], key='gender_up')
+                data['Gender'] = GENDER_CODES[gender_choice]
+        if 'Gender' not in missing:
+            # Files may carry Gender as 'Male'/'Female' text; convert it to the
+            # numeric codes the model expects.
+            data['Gender'] = data['Gender'].map(encode_gender)
+        # Reorder/limit columns and handle prediction for first row
+        absent = [col for col in FEATURE_ORDER if col not in data.columns]
+        if absent:
+            st.error("Uploaded file is missing other required columns.")
+        elif data.empty:
+            st.error("Uploaded file contains no data rows.")
+        else:
+            # Only process the first row for prediction (can be expanded to batch)
+            df_input_single = data[FEATURE_ORDER].iloc[[0]].apply(pd.to_numeric, errors='coerce')
+            # Report unusable cells to the user instead of passing them to the model.
+            invalid = df_input_single.columns[df_input_single.isna().any()].tolist()
+            if invalid:
+                st.error(f"First row has missing or non-numeric values in: {', '.join(invalid)}")
+            else:
+                show_ranked_predictions(df_input_single)

diagnosis_test.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import joblib
+import pandas as pd
+# Smoke test: restore the persisted classifier and label encoder from the
+# working directory and score one hand-written patient record.
+label_encoder = joblib.load('disease_label_encoder.joblib')
+clf = joblib.load('cbc_disease_model.joblib')
+# Sample record intended to look like a bacterial infection.
+# Gender is encoded as 1 -> male, 0 -> female.
+patient_values = {
+    'WBC': 16000, 'LY%': 15, 'MO%': 5, 'NE%': 78, 'EO%': 1, 'BA%': 0.2,
+    'LY#': 2400, 'MO#': 800, 'NE#': 12480, 'EO#': 160, 'BA#': 32,
+    'RBC': 4.8, 'HGB': 13.5, 'HCT': 42.0, 'MCV': 88, 'MCHC': 33, 'MCH': 29,
+    'RDW': 13.5, 'PLT': 270000, 'MPV': 9.5, 'Age': 38, 'Gender': 1,
+}
+new_patient = pd.DataFrame([patient_values])
+# Single best guess, decoded back to its disease name.
+best_code = clf.predict(new_patient)
+best_name = label_encoder.inverse_transform(best_code)[0]
+print("🧾 Most Probable Disease:", best_name)
+# Class probabilities for the one record, highest first, limited to five.
+probas = clf.predict_proba(new_patient)[0]
+leading = probas.argsort()[::-1][:5]
+ranked = [
+    (label_encoder.classes_[idx], round(probas[idx], 4))
+    for idx in leading
+]
+# Print ranked results
+print("🔍 Predicted disease rankings:")
+position = 0
+for disease, prob in ranked:
+    position += 1
+    print(f"{position}. {disease} — {prob * 100:.2f}%")

disease_label_encoder.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4fba78e2960366e6309eb5634bc4bbd35b8909d6821605688826386d838c9adf
+size 761

expanded_integrated_flagged_cbc_dataset.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89b1a8337f0ebe4f04c4834aa3d596a128a08e86ef19e48a9ee2203f9e73e6a7
+size 12293834