Upload 6 files
Browse files- .gitattributes +1 -0
- cbc_disease_model.joblib +3 -0
- diagnosis.py +49 -0
- diagnosis_app.py +102 -0
- diagnosis_test.py +28 -0
- disease_label_encoder.joblib +3 -0
- expanded_integrated_flagged_cbc_dataset.xlsx +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
expanded_integrated_flagged_cbc_dataset.xlsx filter=lfs diff=lfs merge=lfs -text
|
cbc_disease_model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2c1b8eb82fcd76ff5f3d55288ab2d485b4529138c94b94f99d315f1a2e4c23b
|
3 |
+
size 94589665
|
diagnosis.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Training script: fits a random-forest classifier that predicts a disease
# label from CBC (complete blood count) values, saves the fitted model and
# label encoder with joblib, then prints accuracy and a per-class report.
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Read your dataset ---
# read_excel has no `sep` option; for a CSV export of the same data use
# pd.read_csv(..., sep=',') instead.
df = pd.read_excel('expanded_integrated_flagged_cbc_dataset.xlsx')

# --- Preprocessing ---
# Normalise headers so 'Probable Diseases' becomes 'Probable_Diseases'.
df.columns = df.columns.str.strip().str.replace(' ', '_')
# Any value other than exactly 'Male'/'Female' maps to NaN here and is
# median-imputed together with the other features below.
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

feature_cols = [
    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
]
# Non-numeric cells become NaN, then every NaN is replaced by the column median.
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())

# Use first probable disease
# str(x) keeps the split from crashing on cells that are not text.
df['Label'] = df['Probable_Diseases'].fillna('None').apply(
    lambda x: str(x).split(',')[0].strip()
)
label_encoder = LabelEncoder()
df['Label_encoded'] = label_encoder.fit_transform(df['Label'])

# --- Model Training & Evaluation ---
X = df[feature_cols]
y = df['Label_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Save model and label encoder to file
joblib.dump(clf, 'cbc_disease_model.joblib')
joblib.dump(label_encoder, 'disease_label_encoder.joblib')

print("✅ Model and label encoder saved successfully!")

# --- Accuracy and Report ---
print("Accuracy:", accuracy_score(y_test, y_pred))
# Pass the full list of encoded class ids explicitly: without `labels`, the
# report infers the classes from y_test/y_pred and raises when that count
# differs from len(target_names) (e.g. a rare class absent from the test split).
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))
|
diagnosis_app.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Streamlit app: collects CBC (complete blood count) values either from a
# manual entry form or from an uploaded CSV/Excel file and shows the five
# most probable diseases according to the saved random-forest model.
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image

# Load trained model and label encoder.
# Paths are resolved relative to this script instead of a hard-coded
# 'D:\...' location: the old non-raw strings contained invalid escape
# sequences (\p, \d, \c) and only worked on one machine.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# model is reloaded each time; consider st.cache_resource if the installed
# Streamlit version provides it.
APP_DIR = Path(__file__).resolve().parent
clf = joblib.load(APP_DIR / 'cbc_disease_model.joblib')
label_encoder = joblib.load(APP_DIR / 'disease_label_encoder.joblib')

FEATURE_ORDER = [
    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
]

# Same encoding the training script applies to the Gender column.
GENDER_CODES = {'Male': 1, 'Female': 0}


def _show_ranked_predictions(features):
    """Render the five most probable diseases for a single-row feature frame.

    `features` must hold the FEATURE_ORDER columns in that order.
    """
    probas = clf.predict_proba(features)[0]
    sorted_indices = np.argsort(probas)[::-1]
    # predict_proba columns follow clf.classes_ (the encoded label ids), so go
    # through clf.classes_ before looking up the readable disease name.
    top_diseases = [
        (label_encoder.classes_[clf.classes_[i]], probas[i])
        for i in sorted_indices[:5]
    ]
    st.subheader("🧾 Predicted Disease Rankings")
    for rank, (disease, prob) in enumerate(top_diseases, 1):
        st.write(f"{rank}. **{disease}** — {prob * 100:.2f}%")


st.title("🩸 CBC Disease Prediction Web App")

st.markdown("""
Upload your CBC data or a blood report image to get a ranked prediction of possible diseases.
""")

option = st.radio("Input method:", ("Upload Image & Enter Data Manually", "Upload CBC as CSV/Excel File"))

if option == "Upload Image & Enter Data Manually":
    image_file = st.file_uploader("Upload CBC blood report image (JPG, PNG)", type=["jpg", "jpeg", "png"])
    if image_file:
        # The image is only displayed for reference; values are typed in by hand.
        image = Image.open(image_file)
        st.image(image, caption="CBC Blood Report Image", use_column_width=True)
        st.info("Review and copy values from the image into the entry form below.")

    st.subheader("Enter Your Complete Blood Count (CBC) Values")

    user_input = {}

    # Demographics always at top
    demog_col, main_col1, main_col2 = st.columns([1, 2, 2])
    with demog_col:
        # Always show gender and age at the top
        gender_str = st.selectbox("Gender", ["Female", "Male"])
        user_input['Gender'] = GENDER_CODES[gender_str]
        user_input['Age'] = st.number_input("Age (years)", min_value=0, max_value=120, value=30)

    # Other CBC fields split nicely in two columns
    other_fields = [f for f in FEATURE_ORDER if f not in ["Gender", "Age"]]
    half = len(other_fields) // 2
    with main_col1:
        for field in other_fields[:half]:
            user_input[field] = st.number_input(f"{field}:", value=0.0, format="%.2f")
    with main_col2:
        for field in other_fields[half:]:
            user_input[field] = st.number_input(f"{field}:", value=0.0, format="%.2f")

    if st.button("Predict Disease"):
        # Reindex to FEATURE_ORDER so columns match the order used in training.
        df_input = pd.DataFrame([user_input])[FEATURE_ORDER]
        _show_ranked_predictions(df_input)

elif option == "Upload CBC as CSV/Excel File":
    data_file = st.file_uploader("Upload your CBC data file (.csv or .xlsx)", type=["csv", "xlsx"])
    if data_file:
        # Read uploaded file based on extension (case-insensitive, so '.CSV' works too)
        if data_file.name.lower().endswith(".csv"):
            data = pd.read_csv(data_file)
        else:
            data = pd.read_excel(data_file)
        st.write("Uploaded CBC Data:")
        st.write(data)

        # Apply the same header clean-up as the training script.
        data.columns = data.columns.astype(str).str.strip().str.replace(' ', '_')

        # Check for missing Age/Gender
        missing = [col for col in ('Age', 'Gender') if col not in data.columns]
        # Prompt as needed
        if missing:
            st.warning(f"Uploaded file is missing: {', '.join(missing)}")
            if 'Age' in missing:
                age_value = st.number_input("Enter Age (years)", min_value=0, max_value=120, value=30, key='age_up')
                data['Age'] = age_value
            if 'Gender' in missing:
                gender_choice = st.selectbox("Select Gender", ["Female", "Male"], key='gender_up')
                data['Gender'] = GENDER_CODES[gender_choice]

        # Reorder/limit columns and handle prediction for first row
        absent = [col for col in FEATURE_ORDER if col not in data.columns]
        if absent:
            st.error(f"Uploaded file is missing other required columns: {', '.join(absent)}")
        elif data.empty:
            st.error("Uploaded file contains no data rows.")
        else:
            # Only process the first row for prediction (can be expanded to batch)
            df_input_single = data[FEATURE_ORDER].iloc[[0]].copy()
            # Encode a text Gender column the way the training script does.
            if not pd.api.types.is_numeric_dtype(df_input_single['Gender']):
                df_input_single['Gender'] = (
                    df_input_single['Gender'].astype(str).str.strip().str.title().map(GENDER_CODES)
                )
            df_input_single = df_input_single.apply(pd.to_numeric, errors='coerce')
            # Refuse to rank diseases from blank or non-numeric measurements.
            invalid = df_input_single.columns[df_input_single.isna().any()].tolist()
            if invalid:
                st.error(f"First row has missing or non-numeric values in: {', '.join(invalid)}")
            else:
                _show_ranked_predictions(df_input_single)
|
diagnosis_test.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Smoke test: restores the saved classifier and label encoder, then prints
# the top prediction and the five highest-probability diseases for one
# hand-written patient record.
import joblib
import pandas as pd

# Restore the fitted classifier and its label encoder from the working directory
clf = joblib.load('cbc_disease_model.joblib')
label_encoder = joblib.load('disease_label_encoder.joblib')

# Sample patient meant to resemble a bacterial infection.
# Key order matters: it becomes the DataFrame column order fed to the model.
patient_values = {
    'WBC': 16000,
    'LY%': 15,
    'MO%': 5,
    'NE%': 78,
    'EO%': 1,
    'BA%': 0.2,
    'LY#': 2400,
    'MO#': 800,
    'NE#': 12480,
    'EO#': 160,
    'BA#': 32,
    'RBC': 4.8,
    'HGB': 13.5,
    'HCT': 42.0,
    'MCV': 88,
    'MCHC': 33,
    'MCH': 29,
    'RDW': 13.5,
    'PLT': 270000,
    'MPV': 9.5,
    'Age': 38,
    'Gender': 1,  # 1 -> male, 0 -> female
}
new_patient = pd.DataFrame([patient_values])

# Single best guess, decoded back to the disease name
top_code = clf.predict(new_patient)
top_name = label_encoder.inverse_transform(top_code)[0]
print("🧾 Most Probable Disease:", top_name)

# Class probabilities for the one row, highest first, limited to five entries
class_probs = clf.predict_proba(new_patient)[0]
best_five = class_probs.argsort()[::-1][:5]

# Ranked listing, probabilities rounded to four decimals before formatting
print("🔍 Predicted disease rankings:")
for position, idx in enumerate(best_five, start=1):
    name = label_encoder.classes_[idx]
    share = round(class_probs[idx], 4)
    print(f"{position}. {name} — {share * 100:.2f}%")
|
disease_label_encoder.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fba78e2960366e6309eb5634bc4bbd35b8909d6821605688826386d838c9adf
|
3 |
+
size 761
|
expanded_integrated_flagged_cbc_dataset.xlsx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89b1a8337f0ebe4f04c4834aa3d596a128a08e86ef19e48a9ee2203f9e73e6a7
|
3 |
+
size 12293834
|