Mananjp committed on
Commit
2649b61
·
verified ·
1 Parent(s): a6bbb9a

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ expanded_integrated_flagged_cbc_dataset.xlsx filter=lfs diff=lfs merge=lfs -text
cbc_disease_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c1b8eb82fcd76ff5f3d55288ab2d485b4529138c94b94f99d315f1a2e4c23b
3
+ size 94589665
diagnosis.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a random-forest classifier that predicts a probable disease from CBC values.

The script reads the labelled CBC spreadsheet, fits the model on a stratified
70/30 split, saves the model and its label encoder with joblib, and finally
prints the hold-out accuracy and a per-class classification report.
"""
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

DATASET_PATH = 'expanded_integrated_flagged_cbc_dataset.xlsx'
MODEL_PATH = 'cbc_disease_model.joblib'
ENCODER_PATH = 'disease_label_encoder.joblib'

# Numeric encoding of the Gender column; the prediction scripts use the same codes.
GENDER_CODES = {'Male': 1, 'Female': 0}

# Model inputs, in the column order the classifier is trained on.
feature_cols = [
    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
]

# --- Read the dataset ---
df = pd.read_excel(DATASET_PATH)

# --- Preprocessing ---
# Normalise headers: trim stray whitespace and replace inner spaces with underscores.
df.columns = df.columns.str.strip().str.replace(' ', '_', regex=False)

# Tolerate stray whitespace / casing ("male ", "FEMALE") instead of silently
# turning such rows into NaN; anything still unmapped becomes NaN and is
# imputed with the column median below, as before.
df['Gender'] = df['Gender'].astype(str).str.strip().str.title().map(GENDER_CODES)

# Non-numeric cells become NaN, then every NaN is replaced by the column median.
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())

# Use the first listed probable disease as the label; rows without one get 'None'.
# astype(str) keeps this from failing on non-string cells.
df['Label'] = (
    df['Probable_Diseases']
    .fillna('None')
    .astype(str)
    .str.split(',')
    .str[0]
    .str.strip()
)
label_encoder = LabelEncoder()
df['Label_encoded'] = label_encoder.fit_transform(df['Label'])

# --- Model Training & Evaluation ---
X = df[feature_cols]
y = df['Label_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Save model and label encoder to file
joblib.dump(clf, MODEL_PATH)
joblib.dump(label_encoder, ENCODER_PATH)

print("✅ Model and label encoder saved successfully!")

# --- Accuracy and Report ---
print("Accuracy:", accuracy_score(y_test, y_pred))
# Passing labels explicitly keeps target_names aligned with the encoder even
# if some class does not occur in the test split or the predictions.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))
diagnosis_app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit web app that ranks probable diseases from Complete Blood Count (CBC) values.

Run with ``streamlit run diagnosis_app.py``. The trained model
(``cbc_disease_model.joblib``) and its label encoder
(``disease_label_encoder.joblib``) are expected to sit next to this script.
"""
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image

# Resolve the artifacts relative to this file rather than a machine-specific
# absolute path, so the app runs wherever the repository is checked out.
BASE_DIR = Path(__file__).resolve().parent
MODEL_PATH = BASE_DIR / 'cbc_disease_model.joblib'
ENCODER_PATH = BASE_DIR / 'disease_label_encoder.joblib'

# Column order expected by the classifier.
FEATURE_ORDER = [
    'WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#',
    'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'Age', 'Gender'
]

# Numeric encoding of gender used by the model inputs.
GENDER_CODES = {'Male': 1, 'Female': 0}

MANUAL_OPTION = "Upload Image & Enter Data Manually"
FILE_OPTION = "Upload CBC as CSV/Excel File"


@st.cache_resource
def load_artifacts():
    """Load the trained classifier and label encoder once per server process.

    Streamlit re-runs this script on every widget interaction; caching avoids
    re-reading the large model file each time.

    NOTE: joblib files are pickles - only load artifacts from a trusted source.
    """
    return joblib.load(MODEL_PATH), joblib.load(ENCODER_PATH)


def show_ranked_predictions(features, top_n=5):
    """Render the ``top_n`` most probable diseases for a one-row feature frame.

    Probability columns follow ``clf.classes_``; they are translated back to
    disease names through the label encoder so the names stay aligned even if
    the classifier was fitted on a subset of the encoded classes.
    """
    probas = clf.predict_proba(features)[0]
    best = np.argsort(probas)[::-1][:top_n]
    names = label_encoder.inverse_transform(clf.classes_[best])
    st.subheader("🧾 Predicted Disease Rankings")
    for rank, (disease, prob) in enumerate(zip(names, probas[best]), 1):
        st.write(f"{rank}. **{disease}** — {prob * 100:.2f}%")


# Load trained model and label encoder
clf, label_encoder = load_artifacts()

st.title("🩸 CBC Disease Prediction Web App")

st.markdown("""
Upload your CBC data or a blood report image to get a ranked prediction of possible diseases.
""")

option = st.radio("Input method:", (MANUAL_OPTION, FILE_OPTION))

if option == MANUAL_OPTION:
    image_file = st.file_uploader("Upload CBC blood report image (JPG, PNG)", type=["jpg", "jpeg", "png"])
    if image_file:
        image = Image.open(image_file)
        st.image(image, caption="CBC Blood Report Image", use_column_width=True)
        st.info("Review and copy values from the image into the entry form below.")

    st.subheader("Enter Your Complete Blood Count (CBC) Values")

    user_input = {}

    # Demographics always at top
    demog_col, main_col1, main_col2 = st.columns([1, 2, 2])
    with demog_col:
        gender_str = st.selectbox("Gender", ["Female", "Male"])
        user_input['Gender'] = GENDER_CODES[gender_str]
        user_input['Age'] = st.number_input("Age (years)", min_value=0, max_value=120, value=30)

    # Other CBC fields split across the two remaining columns
    other_fields = [f for f in FEATURE_ORDER if f not in ("Gender", "Age")]
    half = len(other_fields) // 2
    for column, fields in ((main_col1, other_fields[:half]), (main_col2, other_fields[half:])):
        with column:
            for field in fields:
                user_input[field] = st.number_input(f"{field}:", value=0.0, format="%.2f")

    if st.button("Predict Disease"):
        df_input = pd.DataFrame([user_input])[FEATURE_ORDER]
        show_ranked_predictions(df_input)

elif option == FILE_OPTION:
    data_file = st.file_uploader("Upload your CBC data file (.csv or .xlsx)", type=["csv", "xlsx"])
    if data_file:
        # Read uploaded file based on extension (case-insensitive: "REPORT.CSV" is CSV too)
        if data_file.name.lower().endswith(".csv"):
            data = pd.read_csv(data_file)
        else:
            data = pd.read_excel(data_file)
        # Trim stray whitespace in headers so " WBC" still matches FEATURE_ORDER.
        data.columns = [str(col).strip() for col in data.columns]
        st.write("Uploaded CBC Data:")
        st.write(data)

        # Ask for Age/Gender interactively when the file does not provide them
        missing = [col for col in ('Age', 'Gender') if col not in data.columns]
        if missing:
            st.warning(f"Uploaded file is missing: {', '.join(missing)}")
            if 'Age' in missing:
                age_value = st.number_input("Enter Age (years)", min_value=0, max_value=120, value=30, key='age_up')
                data['Age'] = age_value
            if 'Gender' in missing:
                gender_choice = st.selectbox("Select Gender", ["Female", "Male"], key='gender_up')
                data['Gender'] = GENDER_CODES[gender_choice]
        if 'Gender' in data.columns and not pd.api.types.is_numeric_dtype(data['Gender']):
            # Files that spell out "Male"/"Female" are encoded to the model's 1/0 codes.
            data['Gender'] = data['Gender'].astype(str).str.strip().str.title().map(GENDER_CODES)

        # Reorder/limit columns and handle prediction for first row
        absent = [col for col in FEATURE_ORDER if col not in data.columns]
        if absent:
            st.error("Uploaded file is missing other required columns.")
        elif data.empty:
            st.error("Uploaded file contains no data rows.")
        else:
            # Only process the first row for prediction (can be expanded to batch)
            show_ranked_predictions(data[FEATURE_ORDER].iloc[[0]])
diagnosis_test.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-check the saved CBC model by predicting for one hand-written patient."""
import joblib
import pandas as pd

# Load model and encoder
clf = joblib.load('cbc_disease_model.joblib')
label_encoder = joblib.load('disease_label_encoder.joblib')

# Sample CBC panel; the original author labelled these values as an input
# "for Bacterial Infection". Gender is encoded 1 -> male, 0 -> female.
cbc_values = {
    'WBC': 16000,
    'LY%': 15,
    'MO%': 5,
    'NE%': 78,
    'EO%': 1,
    'BA%': 0.2,
    'LY#': 2400,
    'MO#': 800,
    'NE#': 12480,
    'EO#': 160,
    'BA#': 32,
    'RBC': 4.8,
    'HGB': 13.5,
    'HCT': 42.0,
    'MCV': 88,
    'MCHC': 33,
    'MCH': 29,
    'RDW': 13.5,
    'PLT': 270000,
    'MPV': 9.5,
    'Age': 38,
    'Gender': 1,
}
new_patient = pd.DataFrame([cbc_values])

# Single most likely label, decoded back to its disease name
predicted_disease = label_encoder.inverse_transform(clf.predict(new_patient))
print("🧾 Most Probable Disease:", predicted_disease[0])

# Class probabilities for the one patient, highest first, limited to five
probas = clf.predict_proba(new_patient)[0]
top_five = probas.argsort()[::-1][:5]

# Print ranked results
print("🔍 Predicted disease rankings:")
rank = 0
for idx in top_five:
    rank += 1
    disease = label_encoder.classes_[idx]
    prob = round(probas[idx], 4)
    print(f"{rank}. {disease} — {prob * 100:.2f}%")
disease_label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fba78e2960366e6309eb5634bc4bbd35b8909d6821605688826386d838c9adf
3
+ size 761
expanded_integrated_flagged_cbc_dataset.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89b1a8337f0ebe4f04c4834aa3d596a128a08e86ef19e48a9ee2203f9e73e6a7
3
+ size 12293834