Upload folder using huggingface_hub
- .gitattributes +3 -0
- .nfs00000001a2244b30003726a6 +1 -0
- .nfs00000001a2b1089c003726a7 +1 -0
- __pycache__/evaluate_backbones.cpython-310.pyc +0 -0
- __pycache__/preprocess.cpython-310.pyc +0 -0
- app.py +139 -43
- app_local_backup.py +100 -47
- app_moe.py +439 -0
- backbone_evaluation_results.json +110 -0
- evaluate_backbones.py +670 -0
- models/.nfs00000001a1a17512003726ad +3 -0
- models/.nfs00000001a234d9cd003726ac +3 -0
- models/.nfs00000001a2a11ea9003726ae +3 -0
- models/efficientnet_b0_transformer_model.pt +3 -0
- models/efficientnet_b3_transformer_model.pt +3 -0
- models/resnet50_transformer_model.pt +3 -0
- moe_evaluation_results.json +801 -0
- templates/.nfs00000001a2893bde003726a5 +1 -0
- test_moe_model.py +276 -0
.gitattributes
CHANGED
@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 temp/temp_audio.wav filter=lfs diff=lfs merge=lfs -text
 temp/temp_image.jpg filter=lfs diff=lfs merge=lfs -text
+models/.nfs00000001a1a17512003726ad filter=lfs diff=lfs merge=lfs -text
+models/.nfs00000001a234d9cd003726ac filter=lfs diff=lfs merge=lfs -text
+models/.nfs00000001a2a11ea9003726ae filter=lfs diff=lfs merge=lfs -text
.nfs00000001a2244b30003726a6
ADDED
@@ -0,0 +1 @@
+

.nfs00000001a2b1089c003726a7
ADDED
@@ -0,0 +1 @@
+

__pycache__/evaluate_backbones.cpython-310.pyc
ADDED
Binary file (16.9 kB)

__pycache__/preprocess.cpython-310.pyc
ADDED
Binary file (1.27 kB)
app.py
CHANGED
@@ -6,21 +6,82 @@ import gradio as gr
 import torchaudio
 import torchvision
 import spaces
-
-# # Import Gradio Spaces GPU decorator
-# try:
-#     from gradio import spaces
-#     HAS_SPACES = True
-#     print("\033[92mINFO\033[0m: Gradio Spaces detected, GPU acceleration will be enabled")
-# except ImportError:
-#     HAS_SPACES = False
-#     print("\033[93mWARN\033[0m: gradio.spaces not available, running without GPU optimization")
+import json
 
 # Add parent directory to path to import preprocess functions
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Import functions from
-from
+# Import functions from preprocess and model definitions
+from preprocess import process_image_data
+from evaluate_backbones import WatermelonModelModular, IMAGE_BACKBONES, AUDIO_BACKBONES
+
+# Define the top-performing models based on evaluation
+TOP_MODELS = [
+    {"image_backbone": "efficientnet_b3", "audio_backbone": "transformer"},
+    {"image_backbone": "efficientnet_b0", "audio_backbone": "transformer"},
+    {"image_backbone": "resnet50", "audio_backbone": "transformer"}
+]
+
+# Define the MoE Model
+class WatermelonMoEModel(torch.nn.Module):
+    def __init__(self, model_configs, model_dir="models", weights=None):
+        """
+        Mixture of Experts model that combines multiple backbone models.
+
+        Args:
+            model_configs: List of dictionaries with 'image_backbone' and 'audio_backbone' keys
+            model_dir: Directory where model checkpoints are stored
+            weights: Optional list of weights for each model (None for equal weighting)
+        """
+        super(WatermelonMoEModel, self).__init__()
+        self.models = []
+        self.model_configs = model_configs
+
+        # Load each model
+        for config in model_configs:
+            img_backbone = config["image_backbone"]
+            audio_backbone = config["audio_backbone"]
+
+            # Initialize model
+            model = WatermelonModelModular(img_backbone, audio_backbone)
+
+            # Load weights
+            model_path = os.path.join(model_dir, f"{img_backbone}_{audio_backbone}_model.pt")
+            if os.path.exists(model_path):
+                print(f"\033[92mINFO\033[0m: Loading model {img_backbone}_{audio_backbone} from {model_path}")
+                model.load_state_dict(torch.load(model_path, map_location='cpu'))
+            else:
+                print(f"\033[91mERR!\033[0m: Model checkpoint not found at {model_path}")
+                continue
+
+            model.eval()  # Set to evaluation mode
+            self.models.append(model)
+
+        # Set model weights (uniform by default)
+        if weights:
+            assert len(weights) == len(self.models), "Number of weights must match number of models"
+            self.weights = weights
+        else:
+            self.weights = [1.0 / len(self.models)] * len(self.models)
+
+        print(f"\033[92mINFO\033[0m: Loaded {len(self.models)} models for MoE ensemble")
+        print(f"\033[92mINFO\033[0m: Model weights: {self.weights}")
+
+    def forward(self, mfcc, image):
+        """
+        Forward pass through the MoE model.
+        Returns the weighted average of all model outputs.
+        """
+        outputs = []
+
+        # Get outputs from each model
+        with torch.no_grad():
+            for i, model in enumerate(self.models):
+                output = model(mfcc, image)
+                outputs.append(output * self.weights[i])
+
+        # Return weighted average
+        return torch.sum(torch.stack(outputs), dim=0)
 
 # Modified version of process_audio_data specifically for the app to handle various tensor shapes
 def app_process_audio_data(waveform, sample_rate):

@@ -76,15 +137,12 @@ def app_process_audio_data(waveform, sample_rate):
         print(traceback.format_exc())
         return None
 
-#
-from preprocess import process_image_data
-
-# Using the decorator directly on the function definition
+# Using the decorator for GPU acceleration
 @spaces.GPU
-def predict_sugar_content(audio, image,
-    """Function with GPU acceleration to predict watermelon sugar content in Brix"""
+def predict_sugar_content(audio, image, model_dir="models", weights=None):
+    """Function with GPU acceleration to predict watermelon sugar content in Brix using MoE model"""
     try:
-        #
+        # Check CUDA availability inside the GPU-decorated function
         if torch.cuda.is_available():
             device = torch.device("cuda")
             print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")

@@ -92,11 +150,11 @@ def predict_sugar_content(audio, image, model_path):
             device = torch.device("cpu")
             print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
 
-        # Load model
-
-
-
-        print(f"\033[92mINFO\033[0m: Loaded model
+        # Load MoE model
+        moe_model = WatermelonMoEModel(TOP_MODELS, model_dir, weights)
+        moe_model.to(device)
+        moe_model.eval()
+        print(f"\033[92mINFO\033[0m: Loaded MoE model with {len(moe_model.models)} backbone models")
 
         # Debug information about input types
         print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")

@@ -188,11 +246,11 @@ def predict_sugar_content(audio, image, model_path):
             processed_image = processed_image.unsqueeze(0).to(device)
             print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
 
-        # Run inference
-        print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
+        # Run inference with MoE model
+        print(f"\033[92mDEBUG\033[0m: Running inference with MoE model on device: {device}")
         if mfcc is not None and processed_image is not None:
             with torch.no_grad():
-                brix_value =
+                brix_value = moe_model(mfcc, processed_image)
                 print(f"\033[92mDEBUG\033[0m: Prediction successful: {brix_value.item()}")
         else:
             return "Error: Failed to process inputs. Please check the debug logs."

@@ -204,6 +262,12 @@ def predict_sugar_content(audio, image, model_path):
             # Create a header with the numerical result
             result = f"🍉 Predicted Sugar Content: {brix_score:.1f}° Brix 🍉\n\n"
 
+            # Add extra info about the MoE model
+            result += "Using Ensemble of Top-3 Models:\n"
+            result += "- EfficientNet-B3 + Transformer\n"
+            result += "- EfficientNet-B0 + Transformer\n"
+            result += "- ResNet-50 + Transformer\n\n"
+
             # Add Brix scale visualization
             result += "Sugar Content Scale (in °Brix):\n"
             result += "──────────────────────────────────\n"

@@ -257,22 +321,27 @@ def predict_sugar_content(audio, image, model_path):
         error_msg += traceback.format_exc()
         print(f"\033[91mERR!\033[0m: {error_msg}")
         return error_msg
-
-print("\033[92mINFO\033[0m: GPU-accelerated prediction function created with @spaces.GPU decorator")
-
 
-def create_app(
+def create_app(model_dir="models", weights=None):
     """Create and launch the Gradio interface"""
     # Define the prediction function with model path
     def predict_fn(audio, image):
-        return predict_sugar_content(audio, image,
+        return predict_sugar_content(audio, image, model_dir, weights)
 
     # Create Gradio interface
-    with gr.Blocks(title="Watermelon Sugar Content Predictor", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("# 🍉 Watermelon Sugar Content Predictor")
+    with gr.Blocks(title="Watermelon Sugar Content Predictor (MoE)", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🍉 Watermelon Sugar Content Predictor (Ensemble Model)")
         gr.Markdown("""
        This app predicts the sugar content (in °Brix) of a watermelon based on its sound and appearance.
 
+        ## What's New
+        This version uses a Mixture of Experts (MoE) ensemble model that combines the three best-performing models:
+        - EfficientNet-B3 + Transformer
+        - EfficientNet-B0 + Transformer
+        - ResNet-50 + Transformer
+
+        The ensemble approach provides more accurate predictions than any single model!
+
        ## Instructions:
        1. Upload or record an audio of tapping the watermelon
        2. Upload or capture an image of the watermelon

@@ -286,7 +355,7 @@ def create_app(model_path):
                 submit_btn = gr.Button("Predict Sugar Content", variant="primary")
 
             with gr.Column():
-                output = gr.Textbox(label="Prediction Results", lines=
+                output = gr.Textbox(label="Prediction Results", lines=15)
 
         submit_btn.click(
             fn=predict_fn,

@@ -302,6 +371,11 @@ def create_app(model_path):
        ## About Brix Measurement
        Brix (°Bx) is a measurement of sugar content in a solution. For watermelons, higher Brix values indicate sweeter fruit.
        The average ripe watermelon has a Brix value between 9-11°.
+
+        ## About the Mixture of Experts Model
+        This app uses a Mixture of Experts (MoE) model that combines predictions from multiple neural networks.
+        Our testing shows the ensemble approach achieves a Mean Absolute Error (MAE) of ~0.22, which is significantly
+        better than any individual model (best individual model: ~0.36 MAE).
        """)
 
        return interface

@@ -309,12 +383,12 @@ def create_app(model_path):
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="Watermelon Sugar Content Prediction App")
+    parser = argparse.ArgumentParser(description="Watermelon Sugar Content Prediction App (MoE)")
     parser.add_argument(
-        "--
+        "--model_dir",
         type=str,
-        default="models
-        help="
+        default="models",
+        help="Directory containing the model checkpoints"
     )
     parser.add_argument(
         "--share",

@@ -326,18 +400,40 @@ if __name__ == "__main__":
         action="store_true",
         help="Enable verbose debug output"
     )
+    parser.add_argument(
+        "--weighting",
+        type=str,
+        choices=["uniform", "performance"],
+        default="uniform",
+        help="How to weight the models (uniform or based on performance)"
+    )
 
     args = parser.parse_args()
 
     if args.debug:
         print(f"\033[92mINFO\033[0m: Debug mode enabled")
 
-    # Check if model exists
-    if not os.path.exists(args.
-        print(f"\033[91mERR!\033[0m: Model not found at {args.
-        print("\033[92mINFO\033[0m: Please train a model first or provide a valid model path")
+    # Check if model directory exists
+    if not os.path.exists(args.model_dir):
+        print(f"\033[91mERR!\033[0m: Model directory not found at {args.model_dir}")
         sys.exit(1)
 
+    # Determine weights based on argument
+    weights = None
+    if args.weighting == "performance":
+        # Weights inversely proportional to the MAE (better models get higher weights)
+        # These are the MAE values from the evaluation results
+        mae_values = [0.3635, 0.3765, 0.3959]  # efficientnet_b3+transformer, efficientnet_b0+transformer, resnet50+transformer
+
+        # Convert to weights (inverse of MAE, normalized)
+        inverse_mae = [1/mae for mae in mae_values]
+        total = sum(inverse_mae)
+        weights = [val/total for val in inverse_mae]
+
+        print(f"\033[92mINFO\033[0m: Using performance-based weights: {weights}")
+    else:
+        print(f"\033[92mINFO\033[0m: Using uniform weights")
+
     # Create and launch the app
-    app = create_app(args.
+    app = create_app(args.model_dir, weights)
     app.launch(share=args.share)
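Editor's note: the performance-based weighting enabled by the new --weighting flag above is just an inverse-MAE normalization. A minimal standalone sketch in Python, reusing the MAE values hard-coded in the diff (the printed numbers are approximate):

# Sketch: inverse-MAE weighting, as in the --weighting performance branch above.
mae_values = [0.3635, 0.3765, 0.3959]             # test MAE of the three transformer-audio models
inverse_mae = [1.0 / mae for mae in mae_values]   # lower error -> larger weight
total = sum(inverse_mae)
weights = [w / total for w in inverse_mae]
print(weights)  # approximately [0.347, 0.335, 0.318] - close to uniform, slightly favoring efficientnet_b3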
app_local_backup.py
CHANGED
@@ -5,12 +5,22 @@ import numpy as np
 import gradio as gr
 import torchaudio
 import torchvision
+import spaces
+
+# # Import Gradio Spaces GPU decorator
+# try:
+#     from gradio import spaces
+#     HAS_SPACES = True
+#     print("\033[92mINFO\033[0m: Gradio Spaces detected, GPU acceleration will be enabled")
+# except ImportError:
+#     HAS_SPACES = False
+#     print("\033[93mWARN\033[0m: gradio.spaces not available, running without GPU optimization")
 
 # Add parent directory to path to import preprocess functions
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Import functions from infer_watermelon.py
-from
+# Import functions from infer_watermelon.py and train_watermelon for the model
+from train_watermelon import WatermelonModel
 
 # Modified version of process_audio_data specifically for the app to handle various tensor shapes
 def app_process_audio_data(waveform, sample_rate):

@@ -69,14 +79,25 @@ def app_process_audio_data(waveform, sample_rate):
 # Similarly for images, but let's import the original one
 from preprocess import process_image_data
 
-
-
-
-
-
-def predict_sweetness(audio, image, model, device):
-    """Predict sweetness of a watermelon from audio and image input"""
+# Using the decorator directly on the function definition
+@spaces.GPU
+def predict_sugar_content(audio, image, model_path):
+    """Function with GPU acceleration to predict watermelon sugar content in Brix"""
     try:
+        # Now check CUDA availability inside the GPU-decorated function
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+            print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
+        else:
+            device = torch.device("cpu")
+            print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")
+
+        # Load model inside the function to ensure it's on the correct device
+        model = WatermelonModel().to(device)
+        model.load_state_dict(torch.load(model_path, map_location=device))
+        model.eval()
+        print(f"\033[92mINFO\033[0m: Loaded model from {model_path}")
+
         # Debug information about input types
         print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
         print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")

@@ -97,7 +118,6 @@ def predict_sweetness(audio, image, model, device):
             print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
         elif isinstance(audio, str):
             # Direct path to audio file
-            import torchaudio
             audio_data, sample_rate = torchaudio.load(audio)
             print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
         else:

@@ -111,9 +131,6 @@ def predict_sweetness(audio, image, model, device):
         temp_image_path = os.path.join(temp_dir, "temp_image.jpg")
 
         # Import necessary libraries
-        import torchaudio
-        import torchvision
-        import torchvision.transforms.functional as F
         from PIL import Image
 
         # Audio handling - direct processing from the data in memory

@@ -162,7 +179,7 @@ def predict_sweetness(audio, image, model, device):
         processed_image = process_image_data(image_tensor)
         print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")
 
-        # Add batch dimension for inference
+        # Add batch dimension for inference and move to device
         if mfcc is not None:
             mfcc = mfcc.unsqueeze(0).to(device)
             print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")

@@ -172,31 +189,67 @@ def predict_sweetness(audio, image, model, device):
             print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")
 
         # Run inference
-        print(f"\033[92mDEBUG\033[0m: Running inference")
+        print(f"\033[92mDEBUG\033[0m: Running inference on device: {device}")
         if mfcc is not None and processed_image is not None:
             with torch.no_grad():
-
-                print(f"\033[92mDEBUG\033[0m: Prediction successful: {
+                brix_value = model(mfcc, processed_image)
+                print(f"\033[92mDEBUG\033[0m: Prediction successful: {brix_value.item()}")
         else:
             return "Error: Failed to process inputs. Please check the debug logs."
 
-        # Format the result
-        if
-
-        #
-
-
-
-
-
-
+        # Format the result with a range display
+        if brix_value is not None:
+            brix_score = brix_value.item()
+
+            # Create a header with the numerical result
+            result = f"🍉 Predicted Sugar Content: {brix_score:.1f}° Brix 🍉\n\n"
+
+            # Add Brix scale visualization
+            result += "Sugar Content Scale (in °Brix):\n"
+            result += "──────────────────────────────────\n"
+
+            # Create the scale display with Brix ranges
+            scale_ranges = [
+                (0, 8, "Low Sugar (< 8° Brix)"),
+                (8, 9, "Mild Sweetness (8-9° Brix)"),
+                (9, 10, "Medium Sweetness (9-10° Brix)"),
+                (10, 11, "Sweet (10-11° Brix)"),
+                (11, 13, "Very Sweet (11-13° Brix)")
+            ]
+
+            # Find which category the prediction falls into
+            user_category = None
+            for min_val, max_val, category_name in scale_ranges:
+                if min_val <= brix_score < max_val:
+                    user_category = category_name
+                    break
+            if brix_score >= scale_ranges[-1][0]:  # Handle edge case
+                user_category = scale_ranges[-1][2]
+
+            # Display the scale with the user's result highlighted
+            for min_val, max_val, category_name in scale_ranges:
+                if category_name == user_category:
+                    result += f"▶ {min_val}-{max_val}: {category_name} ◀ (YOUR WATERMELON)\n"
+                else:
+                    result += f"  {min_val}-{max_val}: {category_name}\n"
+
+            result += "──────────────────────────────────\n\n"
+
+            # Add assessment of the watermelon's sugar content
+            if brix_score < 8:
+                result += "Assessment: This watermelon has low sugar content. It may taste bland or slightly bitter."
+            elif brix_score < 9:
+                result += "Assessment: This watermelon has mild sweetness. Acceptable flavor but not very sweet."
+            elif brix_score < 10:
+                result += "Assessment: This watermelon has moderate sugar content. It should have pleasant sweetness."
+            elif brix_score < 11:
+                result += "Assessment: This watermelon has good sugar content! It should be sweet and juicy."
             else:
-                result += "
+                result += "Assessment: This watermelon has excellent sugar content! Perfect choice for maximum sweetness and flavor."
 
             return result
         else:
-            return "Error: Could not predict
+            return "Error: Could not predict sugar content. Please try again with different inputs."
 
     except Exception as e:
         import traceback

@@ -204,36 +257,36 @@ def predict_sweetness(audio, image, model, device):
         error_msg += traceback.format_exc()
         print(f"\033[91mERR!\033[0m: {error_msg}")
         return error_msg
+
+print("\033[92mINFO\033[0m: GPU-accelerated prediction function created with @spaces.GPU decorator")
+
 
 def create_app(model_path):
     """Create and launch the Gradio interface"""
-    #
-    model, device = init_model(model_path)
-
-    # Define the prediction function with model and device
+    # Define the prediction function with model path
     def predict_fn(audio, image):
-        return
+        return predict_sugar_content(audio, image, model_path)
 
     # Create Gradio interface
-    with gr.Blocks(title="Watermelon
-        gr.Markdown("# 🍉 Watermelon
+    with gr.Blocks(title="Watermelon Sugar Content Predictor", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🍉 Watermelon Sugar Content Predictor")
         gr.Markdown("""
-        This app predicts the
+        This app predicts the sugar content (in °Brix) of a watermelon based on its sound and appearance.
 
        ## Instructions:
        1. Upload or record an audio of tapping the watermelon
        2. Upload or capture an image of the watermelon
-        3. Click '
+        3. Click 'Predict' to get the sugar content estimation
        """)
 
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
                image_input = gr.Image(label="Upload or Capture Image")
-                submit_btn = gr.Button("Predict
+                submit_btn = gr.Button("Predict Sugar Content", variant="primary")
 
            with gr.Column():
-                output = gr.Textbox(label="Prediction Results", lines=
+                output = gr.Textbox(label="Prediction Results", lines=12)
 
        submit_btn.click(
            fn=predict_fn,

@@ -242,13 +295,13 @@ def create_app(model_path):
        )
 
        gr.Markdown("""
-        ##
-
-
-        - Audio analysis using MFCC features and LSTM neural network
-        - Image analysis using ResNet-50 convolutional neural network
+        ## Tips for best results
+        - For audio: Tap the watermelon with your knuckle and record the sound
+        - For image: Take a clear photo of the whole watermelon in good lighting
 
-
+        ## About Brix Measurement
+        Brix (°Bx) is a measurement of sugar content in a solution. For watermelons, higher Brix values indicate sweeter fruit.
+        The average ripe watermelon has a Brix value between 9-11°.
        """)
 
        return interface

@@ -256,7 +309,7 @@ def create_app(model_path):
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="Watermelon
+    parser = argparse.ArgumentParser(description="Watermelon Sugar Content Prediction App")
     parser.add_argument(
         "--model_path",
         type=str,
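Editor's note: the Brix formatting block in the diff above reduces to a simple range lookup. A minimal sketch of that lookup, using the same scale_ranges as the app (the brix_band helper name is illustrative, not part of the repo):

scale_ranges = [
    (0, 8, "Low Sugar (< 8° Brix)"),
    (8, 9, "Mild Sweetness (8-9° Brix)"),
    (9, 10, "Medium Sweetness (9-10° Brix)"),
    (10, 11, "Sweet (10-11° Brix)"),
    (11, 13, "Very Sweet (11-13° Brix)"),
]

def brix_band(brix_score):
    # Hypothetical helper: return the label of the band the score falls into.
    for min_val, max_val, name in scale_ranges:
        if min_val <= brix_score < max_val:
            return name
    return scale_ranges[-1][2]  # scores at or above the top band's lower bound

print(brix_band(10.4))  # "Sweet (10-11° Brix)"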
app_moe.py
ADDED
@@ -0,0 +1,439 @@
import os
import sys
import torch
import numpy as np
import gradio as gr
import torchaudio
import torchvision
import spaces
import json

# Add parent directory to path to import preprocess functions
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import functions from preprocess and model definitions
from preprocess import process_image_data
from evaluate_backbones import WatermelonModelModular, IMAGE_BACKBONES, AUDIO_BACKBONES

# Define the top-performing models based on evaluation
TOP_MODELS = [
    {"image_backbone": "efficientnet_b3", "audio_backbone": "transformer"},
    {"image_backbone": "efficientnet_b0", "audio_backbone": "transformer"},
    {"image_backbone": "resnet50", "audio_backbone": "transformer"}
]

# Define the MoE Model
class WatermelonMoEModel(torch.nn.Module):
    def __init__(self, model_configs, model_dir="models", weights=None):
        """
        Mixture of Experts model that combines multiple backbone models.

        Args:
            model_configs: List of dictionaries with 'image_backbone' and 'audio_backbone' keys
            model_dir: Directory where model checkpoints are stored
            weights: Optional list of weights for each model (None for equal weighting)
        """
        super(WatermelonMoEModel, self).__init__()
        self.models = []
        self.model_configs = model_configs

        # Load each model
        for config in model_configs:
            img_backbone = config["image_backbone"]
            audio_backbone = config["audio_backbone"]

            # Initialize model
            model = WatermelonModelModular(img_backbone, audio_backbone)

            # Load weights
            model_path = os.path.join(model_dir, f"{img_backbone}_{audio_backbone}_model.pt")
            if os.path.exists(model_path):
                print(f"\033[92mINFO\033[0m: Loading model {img_backbone}_{audio_backbone} from {model_path}")
                model.load_state_dict(torch.load(model_path, map_location='cpu'))
            else:
                print(f"\033[91mERR!\033[0m: Model checkpoint not found at {model_path}")
                continue

            model.eval()  # Set to evaluation mode
            self.models.append(model)

        # Set model weights (uniform by default)
        if weights:
            assert len(weights) == len(self.models), "Number of weights must match number of models"
            self.weights = weights
        else:
            self.weights = [1.0 / len(self.models)] * len(self.models)

        print(f"\033[92mINFO\033[0m: Loaded {len(self.models)} models for MoE ensemble")
        print(f"\033[92mINFO\033[0m: Model weights: {self.weights}")

    def forward(self, mfcc, image):
        """
        Forward pass through the MoE model.
        Returns the weighted average of all model outputs.
        """
        outputs = []

        # Get outputs from each model
        with torch.no_grad():
            for i, model in enumerate(self.models):
                output = model(mfcc, image)
                outputs.append(output * self.weights[i])

        # Return weighted average
        return torch.sum(torch.stack(outputs), dim=0)

# Modified version of process_audio_data specifically for the app to handle various tensor shapes
def app_process_audio_data(waveform, sample_rate):
    """Modified version of process_audio_data for the app that handles different tensor dimensions"""
    try:
        print(f"\033[92mDEBUG\033[0m: Processing audio - Initial shape: {waveform.shape}, Sample rate: {sample_rate}")

        # Handle different tensor dimensions
        if waveform.dim() == 3:
            print(f"\033[92mDEBUG\033[0m: Found 3D tensor, converting to 2D")
            # For 3D tensor, take the first item (batch dimension)
            waveform = waveform[0]

        if waveform.dim() == 2:
            # Use the first channel for stereo audio
            waveform = waveform[0]
            print(f"\033[92mDEBUG\033[0m: Using first channel, new shape: {waveform.shape}")

        # Resample to 16kHz if needed
        resample_rate = 16000
        if sample_rate != resample_rate:
            print(f"\033[92mDEBUG\033[0m: Resampling from {sample_rate}Hz to {resample_rate}Hz")
            waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=resample_rate)(waveform)

        # Ensure 3 seconds of audio
        if waveform.size(0) < 3 * resample_rate:
            print(f"\033[92mDEBUG\033[0m: Padding audio from {waveform.size(0)} to {3 * resample_rate} samples")
            waveform = torch.nn.functional.pad(waveform, (0, 3 * resample_rate - waveform.size(0)))
        else:
            print(f"\033[92mDEBUG\033[0m: Trimming audio from {waveform.size(0)} to {3 * resample_rate} samples")
            waveform = waveform[: 3 * resample_rate]

        # Apply MFCC transformation
        print(f"\033[92mDEBUG\033[0m: Applying MFCC transformation")
        mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=resample_rate,
            n_mfcc=13,
            melkwargs={
                "n_fft": 256,
                "win_length": 256,
                "hop_length": 128,
                "n_mels": 40,
            }
        )

        mfcc = mfcc_transform(waveform)
        print(f"\033[92mDEBUG\033[0m: MFCC output shape: {mfcc.shape}")

        return mfcc
    except Exception as e:
        import traceback
        print(f"\033[91mERR!\033[0m: Error in audio processing: {e}")
        print(traceback.format_exc())
        return None

# Using the decorator for GPU acceleration
@spaces.GPU
def predict_sugar_content(audio, image, model_dir="models", weights=None):
    """Function with GPU acceleration to predict watermelon sugar content in Brix using MoE model"""
    try:
        # Check CUDA availability inside the GPU-decorated function
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print(f"\033[92mINFO\033[0m: CUDA is available. Using device: {device}")
        else:
            device = torch.device("cpu")
            print(f"\033[92mINFO\033[0m: CUDA is not available. Using device: {device}")

        # Load MoE model
        moe_model = WatermelonMoEModel(TOP_MODELS, model_dir, weights)
        moe_model.to(device)
        moe_model.eval()
        print(f"\033[92mINFO\033[0m: Loaded MoE model with {len(moe_model.models)} backbone models")

        # Debug information about input types
        print(f"\033[92mDEBUG\033[0m: Audio input type: {type(audio)}")
        print(f"\033[92mDEBUG\033[0m: Audio input shape/length: {len(audio)}")
        print(f"\033[92mDEBUG\033[0m: Image input type: {type(image)}")
        if isinstance(image, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Image input shape: {image.shape}")

        # Handle different audio input formats
        if isinstance(audio, tuple) and len(audio) == 2:
            # Standard Gradio format: (sample_rate, audio_data)
            sample_rate, audio_data = audio
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, tuple) and len(audio) > 2:
            # Sometimes Gradio returns (sample_rate, audio_data, other_info...)
            sample_rate, audio_data = audio[0], audio[-1]
            print(f"\033[92mDEBUG\033[0m: Audio sample rate: {sample_rate}")
            print(f"\033[92mDEBUG\033[0m: Audio data shape: {audio_data.shape}")
        elif isinstance(audio, str):
            # Direct path to audio file
            audio_data, sample_rate = torchaudio.load(audio)
            print(f"\033[92mDEBUG\033[0m: Loaded audio from path with shape: {audio_data.shape}")
        else:
            return f"Error: Unsupported audio format. Got {type(audio)}"

        # Create a temporary file path for the audio and image
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)

        temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
        temp_image_path = os.path.join(temp_dir, "temp_image.jpg")

        # Import necessary libraries
        from PIL import Image

        # Audio handling - direct processing from the data in memory
        if isinstance(audio_data, np.ndarray):
            # Convert numpy array to tensor
            print(f"\033[92mDEBUG\033[0m: Converting numpy audio with shape {audio_data.shape} to tensor")
            audio_tensor = torch.tensor(audio_data).float()

            # Handle different audio dimensions
            if audio_data.ndim == 1:
                # Single channel audio
                audio_tensor = audio_tensor.unsqueeze(0)
            elif audio_data.ndim == 2:
                # Ensure channels are first dimension
                if audio_data.shape[0] > audio_data.shape[1]:
                    # More rows than columns, probably (samples, channels)
                    audio_tensor = torch.tensor(audio_data.T).float()
        else:
            # Already a tensor
            audio_tensor = audio_data.float()

        print(f"\033[92mDEBUG\033[0m: Audio tensor shape before processing: {audio_tensor.shape}")

        # Skip saving/loading and process directly
        mfcc = app_process_audio_data(audio_tensor, sample_rate)
        print(f"\033[92mDEBUG\033[0m: MFCC tensor shape after processing: {mfcc.shape if mfcc is not None else None}")

        # Image handling
        if isinstance(image, np.ndarray):
            print(f"\033[92mDEBUG\033[0m: Converting numpy image with shape {image.shape} to PIL")
            pil_image = Image.fromarray(image)
            pil_image.save(temp_image_path)
            print(f"\033[92mDEBUG\033[0m: Saved image to {temp_image_path}")
        elif isinstance(image, str):
            # If image is already a path
            temp_image_path = image
            print(f"\033[92mDEBUG\033[0m: Using provided image path: {temp_image_path}")
        else:
            return f"Error: Unsupported image format. Got {type(image)}"

        # Process image
        print(f"\033[92mDEBUG\033[0m: Loading and preprocessing image from {temp_image_path}")
        image_tensor = torchvision.io.read_image(temp_image_path)
        print(f"\033[92mDEBUG\033[0m: Loaded image shape: {image_tensor.shape}")
        image_tensor = image_tensor.float()
        processed_image = process_image_data(image_tensor)
        print(f"\033[92mDEBUG\033[0m: Processed image shape: {processed_image.shape if processed_image is not None else None}")

        # Add batch dimension for inference and move to device
        if mfcc is not None:
            mfcc = mfcc.unsqueeze(0).to(device)
            print(f"\033[92mDEBUG\033[0m: Final MFCC shape with batch dimension: {mfcc.shape}")

        if processed_image is not None:
            processed_image = processed_image.unsqueeze(0).to(device)
            print(f"\033[92mDEBUG\033[0m: Final image shape with batch dimension: {processed_image.shape}")

        # Run inference with MoE model
        print(f"\033[92mDEBUG\033[0m: Running inference with MoE model on device: {device}")
        if mfcc is not None and processed_image is not None:
            with torch.no_grad():
                brix_value = moe_model(mfcc, processed_image)
                print(f"\033[92mDEBUG\033[0m: Prediction successful: {brix_value.item()}")
        else:
            return "Error: Failed to process inputs. Please check the debug logs."

        # Format the result with a range display
        if brix_value is not None:
            brix_score = brix_value.item()

            # Create a header with the numerical result
            result = f"🍉 Predicted Sugar Content: {brix_score:.1f}° Brix 🍉\n\n"

            # Add extra info about the MoE model
            result += "Using Ensemble of Top-3 Models:\n"
            result += "- EfficientNet-B3 + Transformer\n"
            result += "- EfficientNet-B0 + Transformer\n"
            result += "- ResNet-50 + Transformer\n\n"

            # Add Brix scale visualization
            result += "Sugar Content Scale (in °Brix):\n"
            result += "──────────────────────────────────\n"

            # Create the scale display with Brix ranges
            scale_ranges = [
                (0, 8, "Low Sugar (< 8° Brix)"),
                (8, 9, "Mild Sweetness (8-9° Brix)"),
                (9, 10, "Medium Sweetness (9-10° Brix)"),
                (10, 11, "Sweet (10-11° Brix)"),
                (11, 13, "Very Sweet (11-13° Brix)")
            ]

            # Find which category the prediction falls into
            user_category = None
            for min_val, max_val, category_name in scale_ranges:
                if min_val <= brix_score < max_val:
                    user_category = category_name
                    break
            if brix_score >= scale_ranges[-1][0]:  # Handle edge case
                user_category = scale_ranges[-1][2]

            # Display the scale with the user's result highlighted
            for min_val, max_val, category_name in scale_ranges:
                if category_name == user_category:
                    result += f"▶ {min_val}-{max_val}: {category_name} ◀ (YOUR WATERMELON)\n"
                else:
                    result += f"  {min_val}-{max_val}: {category_name}\n"

            result += "──────────────────────────────────\n\n"

            # Add assessment of the watermelon's sugar content
            if brix_score < 8:
                result += "Assessment: This watermelon has low sugar content. It may taste bland or slightly bitter."
            elif brix_score < 9:
                result += "Assessment: This watermelon has mild sweetness. Acceptable flavor but not very sweet."
            elif brix_score < 10:
                result += "Assessment: This watermelon has moderate sugar content. It should have pleasant sweetness."
            elif brix_score < 11:
                result += "Assessment: This watermelon has good sugar content! It should be sweet and juicy."
            else:
                result += "Assessment: This watermelon has excellent sugar content! Perfect choice for maximum sweetness and flavor."

            return result
        else:
            return "Error: Could not predict sugar content. Please try again with different inputs."

    except Exception as e:
        import traceback
        error_msg = f"Error: {str(e)}\n\n"
        error_msg += traceback.format_exc()
        print(f"\033[91mERR!\033[0m: {error_msg}")
        return error_msg

def create_app(model_dir="models", weights=None):
    """Create and launch the Gradio interface"""
    # Define the prediction function with model path
    def predict_fn(audio, image):
        return predict_sugar_content(audio, image, model_dir, weights)

    # Create Gradio interface
    with gr.Blocks(title="Watermelon Sugar Content Predictor (MoE)", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 🍉 Watermelon Sugar Content Predictor (Ensemble Model)")
        gr.Markdown("""
        This app predicts the sugar content (in °Brix) of a watermelon based on its sound and appearance.

        ## What's New
        This version uses a Mixture of Experts (MoE) ensemble model that combines the three best-performing models:
        - EfficientNet-B3 + Transformer
        - EfficientNet-B0 + Transformer
        - ResNet-50 + Transformer

        The ensemble approach provides more accurate predictions than any single model!

        ## Instructions:
        1. Upload or record an audio of tapping the watermelon
        2. Upload or capture an image of the watermelon
        3. Click 'Predict' to get the sugar content estimation
        """)

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Upload or Record Audio", type="numpy")
                image_input = gr.Image(label="Upload or Capture Image")
                submit_btn = gr.Button("Predict Sugar Content", variant="primary")

            with gr.Column():
                output = gr.Textbox(label="Prediction Results", lines=15)

        submit_btn.click(
            fn=predict_fn,
            inputs=[audio_input, image_input],
            outputs=output
        )

        gr.Markdown("""
        ## Tips for best results
        - For audio: Tap the watermelon with your knuckle and record the sound
        - For image: Take a clear photo of the whole watermelon in good lighting

        ## About Brix Measurement
        Brix (°Bx) is a measurement of sugar content in a solution. For watermelons, higher Brix values indicate sweeter fruit.
        The average ripe watermelon has a Brix value between 9-11°.

        ## About the Mixture of Experts Model
        This app uses a Mixture of Experts (MoE) model that combines predictions from multiple neural networks.
        Our testing shows the ensemble approach achieves a Mean Absolute Error (MAE) of ~0.22, which is significantly
        better than any individual model (best individual model: ~0.36 MAE).
        """)

    return interface

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Watermelon Sugar Content Prediction App (MoE)")
    parser.add_argument(
        "--model_dir",
        type=str,
        default="models",
        help="Directory containing the model checkpoints"
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a shareable link for the app"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable verbose debug output"
    )
    parser.add_argument(
        "--weighting",
        type=str,
        choices=["uniform", "performance"],
        default="uniform",
        help="How to weight the models (uniform or based on performance)"
    )

    args = parser.parse_args()

    if args.debug:
        print(f"\033[92mINFO\033[0m: Debug mode enabled")

    # Check if model directory exists
    if not os.path.exists(args.model_dir):
        print(f"\033[91mERR!\033[0m: Model directory not found at {args.model_dir}")
        sys.exit(1)

    # Determine weights based on argument
    weights = None
    if args.weighting == "performance":
        # Weights inversely proportional to the MAE (better models get higher weights)
        # These are the MAE values from the evaluation results
        mae_values = [0.3635, 0.3765, 0.3959]  # efficientnet_b3+transformer, efficientnet_b0+transformer, resnet50+transformer

        # Convert to weights (inverse of MAE, normalized)
        inverse_mae = [1/mae for mae in mae_values]
        total = sum(inverse_mae)
        weights = [val/total for val in inverse_mae]

        print(f"\033[92mINFO\033[0m: Using performance-based weights: {weights}")
    else:
        print(f"\033[92mINFO\033[0m: Using uniform weights")

    # Create and launch the app
    app = create_app(args.model_dir, weights)
    app.launch(share=args.share)
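Editor's note: to make the ensembling mechanics of WatermelonMoEModel.forward easy to verify in isolation, here is a hedged sketch that swaps the real WatermelonModelModular experts for constant-output stand-ins. Only the DummyExpert class and the tensor shapes are invented for illustration; the weighted-average logic mirrors the code above.

import torch

class DummyExpert(torch.nn.Module):
    # Stand-in for WatermelonModelModular: always predicts a fixed Brix value.
    def __init__(self, value):
        super().__init__()
        self.value = value
    def forward(self, mfcc, image):
        return torch.full((mfcc.shape[0], 1), self.value)

experts = [DummyExpert(9.0), DummyExpert(10.0), DummyExpert(11.0)]
weights = [0.5, 0.3, 0.2]

mfcc = torch.zeros(1, 13, 376)       # placeholder shapes; the real app passes batched MFCC and image tensors
image = torch.zeros(1, 3, 224, 224)

# Same weighted average as WatermelonMoEModel.forward:
outputs = [expert(mfcc, image) * w for expert, w in zip(experts, weights)]
prediction = torch.sum(torch.stack(outputs), dim=0)
print(prediction.item())  # 0.5*9.0 + 0.3*10.0 + 0.2*11.0 = 9.7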
backbone_evaluation_results.json
ADDED
@@ -0,0 +1,110 @@
[
    {
        "image_backbone": "efficientnet_b3",
        "audio_backbone": "transformer",
        "validation_mse": 0.21577325425086877,
        "validation_mae": 0.36228722945237773,
        "test_mse": 0.21746371760964395,
        "test_mae": 0.36353210285305976,
        "model_path": "test_models/efficientnet_b3_transformer_model.pt"
    },
    {
        "image_backbone": "efficientnet_b0",
        "audio_backbone": "transformer",
        "validation_mse": 0.24033201676912797,
        "validation_mae": 0.42209602166444826,
        "test_mse": 0.19470563121140003,
        "test_mae": 0.37649240642786025,
        "model_path": "test_models/efficientnet_b0_transformer_model.pt"
    },
    {
        "image_backbone": "resnet50",
        "audio_backbone": "transformer",
        "validation_mse": 0.22672857019381645,
        "validation_mae": 0.3926378931754675,
        "test_mse": 0.22427306957542897,
        "test_mae": 0.39585837423801423,
        "model_path": "test_models/resnet50_transformer_model.pt"
    },
    {
        "image_backbone": "resnet50",
        "audio_backbone": "bidirectional_lstm",
        "validation_mse": 0.2967155438203078,
        "validation_mae": 0.3850937023376807,
        "test_mse": 0.36476454623043536,
        "test_mae": 0.425818096101284,
        "model_path": "test_models/resnet50_bidirectional_lstm_model.pt"
    },
    {
        "image_backbone": "efficientnet_b0",
        "audio_backbone": "bidirectional_lstm",
        "validation_mse": 0.5120524473679371,
        "validation_mae": 0.5665570046657171,
        "test_mse": 0.5059382550418376,
        "test_mae": 0.555050653219223,
        "model_path": "test_models/efficientnet_b0_bidirectional_lstm_model.pt"
    },
    {
        "image_backbone": "efficientnet_b3",
        "audio_backbone": "bidirectional_lstm",
        "validation_mse": 0.8020018790012751,
        "validation_mae": 0.7953977386156718,
        "test_mse": 0.7042828559875488,
        "test_mae": 0.7441241115331649,
        "model_path": "test_models/efficientnet_b3_bidirectional_lstm_model.pt"
    },
    {
        "image_backbone": "efficientnet_b0",
        "audio_backbone": "gru",
        "validation_mse": 1.1340507984161377,
        "validation_mae": 0.8290961503982544,
        "test_mse": 0.9705999374389649,
        "test_mae": 0.7704607486724854,
        "model_path": "test_models/efficientnet_b0_gru_model.pt"
    },
    {
        "image_backbone": "efficientnet_b0",
        "audio_backbone": "lstm",
        "validation_mse": 2.787272185087204,
        "validation_mae": 1.5404645502567291,
        "test_mse": 2.901867628097534,
        "test_mae": 1.5843785762786866,
        "model_path": "test_models/efficientnet_b0_lstm_model.pt"
    },
    {
        "image_backbone": "resnet50",
        "audio_backbone": "gru",
        "validation_mse": 3.9335442543029786,
        "validation_mae": 1.8762320041656495,
        "test_mse": 3.72695152759552,
        "test_mae": 1.8381730556488036,
        "model_path": "test_models/resnet50_gru_model.pt"
    },
    {
        "image_backbone": "resnet50",
        "audio_backbone": "lstm",
        "validation_mse": 6.088638782501221,
        "validation_mae": 2.3887929677963258,
        "test_mse": 6.1847597599029545,
        "test_mae": 2.418113374710083,
        "model_path": "test_models/resnet50_lstm_model.pt"
    },
    {
        "image_backbone": "efficientnet_b3",
        "audio_backbone": "gru",
        "validation_mse": 104.58460273742676,
        "validation_mae": 10.183499813079834,
        "test_mse": 104.58482055664062,
        "test_mae": 10.180697345733643,
        "model_path": "test_models/efficientnet_b3_gru_model.pt"
    },
    {
        "image_backbone": "efficientnet_b3",
        "audio_backbone": "lstm",
        "validation_mse": 105.40057525634765,
        "validation_mae": 10.221695899963379,
        "test_mse": 105.17274551391601,
        "test_mae": 10.21053056716919,
        "model_path": "test_models/efficientnet_b3_lstm_model.pt"
    }
]
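A small, hedged sketch of how these results can be consumed (not part of the commit): load the JSON above and report the combination with the lowest test MAE.

import json

# Load the backbone evaluation results and print the best image/audio combination.
with open("backbone_evaluation_results.json", "r") as f:
    results = json.load(f)

best = min(results, key=lambda r: r["test_mae"])
print(f"Best combo: {best['image_backbone']} + {best['audio_backbone']} "
      f"(test MAE {best['test_mae']:.3f}, checkpoint {best['model_path']})")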
evaluate_backbones.py
ADDED
@@ -0,0 +1,670 @@
import os
import torch
import torchaudio
import torchvision
import numpy as np
import time
import json
from torch.utils.data import Dataset, DataLoader
import sys
from tqdm import tqdm

# Add parent directory to path to import the preprocess functions
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from preprocess import process_audio_data, process_image_data

# Print library versions
print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")

# Device selection
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"\033[92mINFO\033[0m: Using device: {device}")

# Hyperparameters
batch_size = 16
epochs = 1  # Just one epoch for evaluation
learning_rate = 0.0001


class WatermelonDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.samples = []

        # Walk through the directory structure
        for sweetness_dir in os.listdir(data_dir):
            sweetness = float(sweetness_dir)
            sweetness_path = os.path.join(data_dir, sweetness_dir)

            if os.path.isdir(sweetness_path):
                for id_dir in os.listdir(sweetness_path):
                    id_path = os.path.join(sweetness_path, id_dir)

                    if os.path.isdir(id_path):
                        audio_file = os.path.join(id_path, f"{id_dir}.wav")
                        image_file = os.path.join(id_path, f"{id_dir}.jpg")

                        if os.path.exists(audio_file) and os.path.exists(image_file):
                            self.samples.append((audio_file, image_file, sweetness))

        print(f"\033[92mINFO\033[0m: Loaded {len(self.samples)} samples from {data_dir}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, image_path, label = self.samples[idx]

        # Load and process audio
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            mfcc = process_audio_data(waveform, sample_rate)

            # Load and process image
            image = torchvision.io.read_image(image_path)
            image = image.float()
            processed_image = process_image_data(image)

            return mfcc, processed_image, torch.tensor(label).float()
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error processing sample {idx}: {e}")
            # Return a fallback sample or skip this sample
            # For simplicity, we'll return the first sample again
            if idx == 0:  # Prevent infinite recursion
                raise e
            return self.__getitem__(0)


# Define available backbone models
IMAGE_BACKBONES = {
    "resnet50": {
        "model": torchvision.models.resnet50,
        "weights": torchvision.models.ResNet50_Weights.DEFAULT,
        "output_dim": lambda model: model.fc.in_features
    },
    "efficientnet_b0": {
        "model": torchvision.models.efficientnet_b0,
        "weights": torchvision.models.EfficientNet_B0_Weights.DEFAULT,
        "output_dim": lambda model: model.classifier[1].in_features
    },
    "efficientnet_b3": {
        "model": torchvision.models.efficientnet_b3,
        "weights": torchvision.models.EfficientNet_B3_Weights.DEFAULT,
        "output_dim": lambda model: model.classifier[1].in_features
    }
}

AUDIO_BACKBONES = {
    "lstm": {
        "model": lambda input_size, hidden_size: torch.nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True
        ),
        "output_dim": lambda hidden_size: hidden_size
    },
    "gru": {
        "model": lambda input_size, hidden_size: torch.nn.GRU(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True
        ),
        "output_dim": lambda hidden_size: hidden_size
    },
    "bidirectional_lstm": {
        "model": lambda input_size, hidden_size: torch.nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True, bidirectional=True
        ),
        "output_dim": lambda hidden_size: hidden_size * 2  # * 2 because bidirectional
    },
    "transformer": {
        "model": lambda input_size, hidden_size: torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(
                d_model=input_size, nhead=8, dim_feedforward=hidden_size, batch_first=True
            ),
            num_layers=2
        ),
        "output_dim": lambda hidden_size: 376  # Using input_size (mfcc dimensions)
    }
}


class WatermelonModelModular(torch.nn.Module):
    def __init__(self, image_backbone_name, audio_backbone_name, audio_hidden_size=128):
        super(WatermelonModelModular, self).__init__()

        # Audio backbone setup
        self.audio_backbone_name = audio_backbone_name
        self.audio_hidden_size = audio_hidden_size
        self.audio_input_size = 376  # From MFCC dimensions

        audio_config = AUDIO_BACKBONES[audio_backbone_name]
        self.audio_backbone = audio_config["model"](self.audio_input_size, self.audio_hidden_size)
        audio_output_dim = audio_config["output_dim"](self.audio_hidden_size)

        self.audio_fc = torch.nn.Linear(audio_output_dim, 128)

        # Image backbone setup
        self.image_backbone_name = image_backbone_name
        image_config = IMAGE_BACKBONES[image_backbone_name]

        self.image_backbone = image_config["model"](weights=image_config["weights"])

        # Replace final layer for all image backbones to get features
        if image_backbone_name.startswith("resnet"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.fc = torch.nn.Identity()
        elif image_backbone_name.startswith("efficientnet"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.classifier = torch.nn.Identity()
        elif image_backbone_name.startswith("convnext"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.classifier = torch.nn.Identity()
        elif image_backbone_name.startswith("swin"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.head = torch.nn.Identity()

        self.image_fc = torch.nn.Linear(self.image_output_dim, 128)

        # Fully connected layers for final prediction
        self.fc1 = torch.nn.Linear(256, 64)
        self.fc2 = torch.nn.Linear(64, 1)
        self.relu = torch.nn.ReLU()

    def forward(self, mfcc, image):
        # Audio backbone processing
        if self.audio_backbone_name == "lstm" or self.audio_backbone_name == "gru":
            audio_output, _ = self.audio_backbone(mfcc)
            audio_output = audio_output[:, -1, :]  # Use the output of the last time step
        elif self.audio_backbone_name == "bidirectional_lstm":
            audio_output, _ = self.audio_backbone(mfcc)
            audio_output = audio_output[:, -1, :]  # Use the output of the last time step
        elif self.audio_backbone_name == "transformer":
            audio_output = self.audio_backbone(mfcc)
            audio_output = audio_output.mean(dim=1)  # Average pooling over sequence length

        audio_output = self.audio_fc(audio_output)

        # Image backbone processing
        image_output = self.image_backbone(image)
        image_output = self.image_fc(image_output)

        # Concatenate audio and image outputs
        merged = torch.cat((audio_output, image_output), dim=1)

        # Fully connected layers
        output = self.relu(self.fc1(merged))
        output = self.fc2(output)

        return output


def evaluate_model(data_dir, image_backbone, audio_backbone, audio_hidden_size=128, save_model_dir=None):
    # Adjust batch size based on model complexity to avoid OOM errors
    adjusted_batch_size = batch_size

    # Models that typically require more memory get smaller batch sizes
    if image_backbone in ["swin_b", "convnext_base"] or audio_backbone in ["transformer", "bidirectional_lstm"]:
        adjusted_batch_size = max(4, batch_size // 2)  # At least batch size of 4, but reduce by half if needed
        print(f"\033[92mINFO\033[0m: Adjusted batch size to {adjusted_batch_size} for larger model")

    # Create dataset
    dataset = WatermelonDataset(data_dir)
    n_samples = len(dataset)

    # Split dataset
    train_size = int(0.7 * n_samples)
    val_size = int(0.2 * n_samples)
    test_size = n_samples - train_size - val_size

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )

    train_loader = DataLoader(train_dataset, batch_size=adjusted_batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=adjusted_batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=adjusted_batch_size, shuffle=False)

    # Initialize model
    model = WatermelonModelModular(image_backbone, audio_backbone, audio_hidden_size).to(device)

    # Loss function and optimizer
    criterion = torch.nn.MSELoss()
    mae_criterion = torch.nn.L1Loss()  # For MAE evaluation
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print(f"\033[92mINFO\033[0m: Evaluating model with {image_backbone} (image) and {audio_backbone} (audio)")
    print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
    print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
    print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
    print(f"\033[92mINFO\033[0m: Batch size: {adjusted_batch_size}")

    # Training loop
    print(f"\033[92mINFO\033[0m: Training for evaluation...")
    model.train()
    running_loss = 0.0

    # Wrap with tqdm for progress visualization
    train_iterator = tqdm(train_loader, desc="Training")

    for i, (mfcc, image, label) in enumerate(train_iterator):
        try:
            mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)

            optimizer.zero_grad()
            output = model(mfcc, image)
            label = label.view(-1, 1).float()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_iterator.set_postfix({"Loss": f"{loss.item():.4f}"})

            # Clear memory after each batch
            if device.type == 'cuda':
                del mfcc, image, label, output, loss
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error in training batch {i}: {e}")
            # Clear memory in case of error
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            continue

    # Validation phase
    print(f"\033[92mINFO\033[0m: Validating...")
    model.eval()
    val_loss = 0.0
    val_mae = 0.0

    val_iterator = tqdm(val_loader, desc="Validation")

    with torch.no_grad():
        for i, (mfcc, image, label) in enumerate(val_iterator):
            try:
                mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
                output = model(mfcc, image)
                label = label.view(-1, 1).float()

                # Calculate MSE loss
                loss = criterion(output, label)
                val_loss += loss.item()

                # Calculate MAE
                mae = mae_criterion(output, label)
                val_mae += mae.item()

                val_iterator.set_postfix({"MSE": f"{loss.item():.4f}", "MAE": f"{mae.item():.4f}"})

                # Clear memory after each batch
                if device.type == 'cuda':
                    del mfcc, image, label, output, loss, mae
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error in validation batch {i}: {e}")
                # Clear memory in case of error
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                continue

    avg_val_loss = val_loss / len(val_loader) if len(val_loader) > 0 else float('inf')
    avg_val_mae = val_mae / len(val_loader) if len(val_loader) > 0 else float('inf')

    # Test phase
    print(f"\033[92mINFO\033[0m: Testing...")
    model.eval()
    test_loss = 0.0
    test_mae = 0.0

    test_iterator = tqdm(test_loader, desc="Testing")

    with torch.no_grad():
        for i, (mfcc, image, label) in enumerate(test_iterator):
            try:
                mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
                output = model(mfcc, image)
                label = label.view(-1, 1).float()

                # Calculate MSE loss
                loss = criterion(output, label)
                test_loss += loss.item()

                # Calculate MAE
                mae = mae_criterion(output, label)
                test_mae += mae.item()

                test_iterator.set_postfix({"MSE": f"{loss.item():.4f}", "MAE": f"{mae.item():.4f}"})

                # Clear memory after each batch
                if device.type == 'cuda':
                    del mfcc, image, label, output, loss, mae
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error in test batch {i}: {e}")
                # Clear memory in case of error
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                continue

    avg_test_loss = test_loss / len(test_loader) if len(test_loader) > 0 else float('inf')
    avg_test_mae = test_mae / len(test_loader) if len(test_loader) > 0 else float('inf')

    results = {
        "image_backbone": image_backbone,
        "audio_backbone": audio_backbone,
        "validation_mse": avg_val_loss,
        "validation_mae": avg_val_mae,
        "test_mse": avg_test_loss,
        "test_mae": avg_test_mae
    }

    print(f"\033[92mINFO\033[0m: Evaluation Results:")
    print(f"Image Backbone: {image_backbone}")
    print(f"Audio Backbone: {audio_backbone}")
    print(f"Validation MSE: {avg_val_loss:.4f}")
    print(f"Validation MAE: {avg_val_mae:.4f}")
    print(f"Test MSE: {avg_test_loss:.4f}")
    print(f"Test MAE: {avg_test_mae:.4f}")

    # Save model if save_model_dir is provided
    if save_model_dir:
        os.makedirs(save_model_dir, exist_ok=True)
        model_filename = f"{image_backbone}_{audio_backbone}_model.pt"
        model_path = os.path.join(save_model_dir, model_filename)
        torch.save(model.state_dict(), model_path)
        print(f"\033[92mINFO\033[0m: Model saved to {model_path}")

        # Add model path to results
        results["model_path"] = model_path

    # Clean up memory before returning
    if device.type == 'cuda':
        del model, optimizer, criterion, mae_criterion
        torch.cuda.empty_cache()

    return results


def evaluate_all_combinations(data_dir, image_backbones=None, audio_backbones=None, save_model_dir="test_models", results_file="backbone_evaluation_results.json"):
    if image_backbones is None:
        image_backbones = list(IMAGE_BACKBONES.keys())

    if audio_backbones is None:
        audio_backbones = list(AUDIO_BACKBONES.keys())

    # Create directory for saving models
    if save_model_dir:
        os.makedirs(save_model_dir, exist_ok=True)

    # Load previous results if the file exists
    results = []
    evaluated_combinations = set()

    if os.path.exists(results_file):
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)
            evaluated_combinations = {(r["image_backbone"], r["audio_backbone"]) for r in results}
            print(f"\033[92mINFO\033[0m: Loaded {len(results)} previous results from {results_file}")
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error loading previous results from {results_file}: {e}")
            results = []
            evaluated_combinations = set()
    else:
        print(f"\033[93mWARN\033[0m: Results file '{results_file}' does not exist. Starting with empty results.")

    # Create combinations to evaluate, skipping any that have already been evaluated
    combinations = [(img, aud) for img in image_backbones for aud in audio_backbones
                    if (img, aud) not in evaluated_combinations]

    if len(combinations) < len(image_backbones) * len(audio_backbones):
        print(f"\033[92mINFO\033[0m: Skipping {len(evaluated_combinations)} already evaluated combinations")

    print(f"\033[92mINFO\033[0m: Will evaluate {len(combinations)} combinations")

    for image_backbone, audio_backbone in combinations:
        print(f"\033[92mINFO\033[0m: Evaluating {image_backbone} + {audio_backbone}")
        try:
            # Clean GPU memory before each model evaluation
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared before evaluation")
                # Print memory usage for debugging
                print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

            result = evaluate_model(data_dir, image_backbone, audio_backbone, save_model_dir=save_model_dir)
            results.append(result)

            # Save results after each evaluation
            save_results(results, results_file)
            print(f"\033[92mINFO\033[0m: Updated results saved to {results_file}")

            # Force garbage collection to free memory
            import gc
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared after evaluation")
                # Print memory usage for debugging
                print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error evaluating {image_backbone} + {audio_backbone}: {e}")
            print(f"\033[91mERR!\033[0m: To continue from this point, use --start_from={image_backbone}:{audio_backbone}")

            # Force garbage collection to free memory even if there's an error
            import gc
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared after error")

            continue

    # Sort results by test MAE (ascending)
    results.sort(key=lambda x: x["test_mae"])

    # Save final sorted results
    save_results(results, results_file)

    print("\n\033[92mINFO\033[0m: === FINAL RESULTS (Sorted by Test MAE) ===")
    print(f"{'Image Backbone':<20} {'Audio Backbone':<20} {'Val MAE':<10} {'Test MAE':<10}")
    print("="*60)

    for result in results:
        print(f"{result['image_backbone']:<20} {result['audio_backbone']:<20} {result['validation_mae']:<10.4f} {result['test_mae']:<10.4f}")

    return results


def save_results(results, filename="backbone_evaluation_results.json"):
    """Save evaluation results to a JSON file."""
    with open(filename, 'w') as f:
        json.dump(results, f, indent=4)
    print(f"\033[92mINFO\033[0m: Results saved to {filename}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate Different Backbones for Watermelon Sweetness Prediction")
    parser.add_argument(
        "--data_dir",
        type=str,
        default="../cleaned",
        help="Path to the cleaned dataset directory"
    )
    parser.add_argument(
        "--image_backbone",
        type=str,
        default=None,
        help="Specific image backbone to evaluate (leave empty to evaluate all available)"
    )
    parser.add_argument(
        "--audio_backbone",
        type=str,
        default=None,
        help="Specific audio backbone to evaluate (leave empty to evaluate all available)"
    )
    parser.add_argument(
        "--evaluate_all",
        action="store_true",
        help="Evaluate all combinations of backbones"
    )
    parser.add_argument(
        "--start_from",
        type=str,
        default=None,
        help="Start evaluation from a specific combination, format: 'image_backbone:audio_backbone'"
    )
    parser.add_argument(
        "--prioritize_efficient",
        action="store_true",
        help="Prioritize more efficient models first to avoid memory issues"
    )
    parser.add_argument(
        "--results_file",
        type=str,
        default="backbone_evaluation_results.json",
        help="File to save the evaluation results"
    )
    parser.add_argument(
        "--load_previous_results",
        action="store_true",
        help="Load previous results from results_file if it exists"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default="test_models",
        help="Directory to save model checkpoints"
    )

    args = parser.parse_args()

    # Create model directory if it doesn't exist
    if args.model_dir:
        os.makedirs(args.model_dir, exist_ok=True)

    print(f"\033[92mINFO\033[0m: === Available Image Backbones ===")
    for name in IMAGE_BACKBONES.keys():
        print(f"- {name}")

    print(f"\033[92mINFO\033[0m: === Available Audio Backbones ===")
    for name in AUDIO_BACKBONES.keys():
        print(f"- {name}")

    if args.evaluate_all:
        evaluate_all_combinations(args.data_dir, results_file=args.results_file, save_model_dir=args.model_dir)
    elif args.image_backbone and args.audio_backbone:
        result = evaluate_model(args.data_dir, args.image_backbone, args.audio_backbone, save_model_dir=args.model_dir)
        save_results([result], args.results_file)
    else:
        # Define a default set of backbones to evaluate if not specified
        if args.prioritize_efficient:
            # Start with less memory-intensive models
            image_backbones = ["resnet50", "efficientnet_b0", "resnet101", "efficientnet_b3", "convnext_base", "swin_b"]
            audio_backbones = ["lstm", "gru", "bidirectional_lstm", "transformer"]
        else:
            # Default selection focusing on better performance models
            image_backbones = ["resnet101", "efficientnet_b3", "swin_b"]
            audio_backbones = ["lstm", "bidirectional_lstm", "transformer"]

        # Create all combinations
        combinations = [(img, aud) for img in image_backbones for aud in audio_backbones]

        # Load previous results if requested and file exists
        previous_results = []
        previous_combinations = set()
        if args.load_previous_results:
            try:
                if os.path.exists(args.results_file):
                    with open(args.results_file, 'r') as f:
                        previous_results = json.load(f)
                    previous_combinations = {(r["image_backbone"], r["audio_backbone"]) for r in previous_results}
                    print(f"\033[92mINFO\033[0m: Loaded {len(previous_results)} previous results")
                else:
                    print(f"\033[93mWARN\033[0m: Results file '{args.results_file}' does not exist. Starting with empty results.")
            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error loading previous results: {e}")
                previous_results = []
                previous_combinations = set()

        # If starting from a specific point
        if args.start_from:
            try:
                start_img, start_aud = args.start_from.split(':')
                start_idx = combinations.index((start_img, start_aud))
                combinations = combinations[start_idx:]
                print(f"\033[92mINFO\033[0m: Starting from combination: {start_img} (image) + {start_aud} (audio)")
            except (ValueError, IndexError):
                print(f"\033[91mERR!\033[0m: Invalid start_from format or combination not found. Format should be 'image_backbone:audio_backbone'")
                print(f"\033[91mERR!\033[0m: Continuing with all combinations.")

        # Skip combinations that have already been evaluated
        if previous_combinations:
            original_count = len(combinations)
            combinations = [(img, aud) for img, aud in combinations if (img, aud) not in previous_combinations]
            print(f"\033[92mINFO\033[0m: Skipping {original_count - len(combinations)} already evaluated combinations")

        # Evaluate each combination
        results = previous_results.copy()

        for img_backbone, audio_backbone in combinations:
            print(f"\033[92mINFO\033[0m: Evaluating {img_backbone} + {audio_backbone}")
            try:
                # Clean GPU memory before each model evaluation
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared before evaluation")
                    print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                    print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

                result = evaluate_model(args.data_dir, img_backbone, audio_backbone, save_model_dir=args.model_dir)
                results.append(result)

                # Save results after each evaluation
                save_results(results, args.results_file)

                # Force garbage collection to free memory
                import gc
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared after evaluation")
                    print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                    print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error evaluating {img_backbone} + {audio_backbone}: {e}")
                print(f"\033[91mERR!\033[0m: To continue from this point later, use --start_from={img_backbone}:{audio_backbone}")

                # Force garbage collection to free memory even if there's an error
                import gc
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared after error")

                continue

        # Sort results by test MAE (ascending)
        results.sort(key=lambda x: x["test_mae"])

        # Save final sorted results
        save_results(results, args.results_file)

        print("\n\033[92mINFO\033[0m: === FINAL RESULTS (Sorted by Test MAE) ===")
        print(f"{'Image Backbone':<20} {'Audio Backbone':<20} {'Val MAE':<10} {'Test MAE':<10}")
        print("="*60)

        for result in results:
            print(f"{result['image_backbone']:<20} {result['audio_backbone']:<20} {result['validation_mae']:<10.4f} {result['test_mae']:<10.4f}")
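A minimal usage sketch for the script above (not part of the commit; the dataset path matches the script's default, everything else is illustrative):

# Illustrative programmatic use of evaluate_backbones.py
from evaluate_backbones import evaluate_model, evaluate_all_combinations

# Evaluate a single combination; on success this also writes
# test_models/efficientnet_b3_transformer_model.pt and returns the metrics dict.
result = evaluate_model("../cleaned", "efficientnet_b3", "transformer", save_model_dir="test_models")
print(result["test_mae"])

# Or sweep every registered image/audio backbone pair, resuming from any
# previously saved results file.
evaluate_all_combinations("../cleaned", results_file="backbone_evaluation_results.json")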
models/.nfs00000001a1a17512003726ad
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:02999bd33592de717dc1ec8054dc570193074c3f25a7283b3daa580b727b7134
size 96095572
models/.nfs00000001a234d9cd003726ac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5df632222fa87e09e635f90e5cce14bdd9fd34b442bf18daaf13e54dedfed132
size 96095572
models/.nfs00000001a2a11ea9003726ae
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:80f999a1540c42ed74491692aa66c3b5a6171f972bdf47c9d52556fe1673c8dd
size 96095572
models/efficientnet_b0_transformer_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eec8d23f6454198e147db3ff31e497a0fed8cc0fa690f58e2576e9190ca54aa7
size 22597034
models/efficientnet_b3_transformer_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da70bf6bef70cfa3795e566fd58523a9b41b01c151fb37fd3b255262c2b47451
size 49751930
models/resnet50_transformer_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cec4fe964defc58fea1f6c26c714c27680a4aa81b131795e8cbeadb6e7be9bd5
size 101004668
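The .pt files above are plain state_dicts saved by evaluate_model(), so loading one requires rebuilding the matching architecture first. A hedged sketch (backbone names taken from the file name; map_location and eval mode are ordinary PyTorch usage, not something this commit prescribes):

import torch
from evaluate_backbones import WatermelonModelModular

# Rebuild the efficientnet_b3 + transformer model and load its saved weights (CPU shown for safety).
model = WatermelonModelModular("efficientnet_b3", "transformer")
state = torch.load("models/efficientnet_b3_transformer_model.pt", map_location="cpu")
model.load_state_dict(state)
model.eval()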
moe_evaluation_results.json
ADDED
@@ -0,0 +1,801 @@
{
    "moe_test_mae": 0.19680618420243262,
    "moe_test_mse": 0.05606407420709729,
    "true_labels": [ ... per-sample ground-truth Brix values for the test split ... ],
    "moe_predictions": [ ... per-sample MoE ensemble predictions ... ],
    "individual_predictions": {
        "efficientnet_b3_transformer": [ ... per-sample expert predictions ... ],
        "efficientnet_b0_transformer": [ ... per-sample expert predictions ... ],
        "resnet50_transformer": [ ... per-sample expert predictions; diff view truncated here ... ]
    }
}
|
775 |
+
10.82922649383545,
|
776 |
+
11.55932903289795,
|
777 |
+
8.709542274475098,
|
778 |
+
9.288893699645996,
|
779 |
+
11.48713207244873,
|
780 |
+
9.693202018737793,
|
781 |
+
10.82302188873291,
|
782 |
+
11.73450756072998,
|
783 |
+
11.416834831237793,
|
784 |
+
11.133091926574707,
|
785 |
+
9.71113109588623,
|
786 |
+
10.830121040344238,
|
787 |
+
10.894770622253418,
|
788 |
+
9.935094833374023,
|
789 |
+
11.377425193786621,
|
790 |
+
11.13464641571045,
|
791 |
+
11.39898681640625,
|
792 |
+
12.140122413635254,
|
793 |
+
9.269479751586914,
|
794 |
+
12.450774192810059,
|
795 |
+
10.820216178894043,
|
796 |
+
9.736580848693848,
|
797 |
+
10.17590045928955,
|
798 |
+
9.74850845336914
|
799 |
+
]
|
800 |
+
}
|
801 |
+
}
|
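The file above stores the ground-truth Brix values alongside the MoE and per-expert predictions, so the reported errors can be re-derived offline. A minimal sketch of that check (assumes the JSON sits in the working directory; numpy is already used by the scripts in this repo):

import json
import numpy as np

with open("moe_evaluation_results.json") as f:
    results = json.load(f)

y_true = np.array(results["true_labels"])

# MAE of the ensemble, recomputed from the stored per-sample predictions
moe_pred = np.array(results["moe_predictions"])
print(f"MoE MAE: {np.mean(np.abs(moe_pred - y_true)):.4f}")

# MAE of each individual expert (keys such as "resnet50_transformer")
for name, preds in results["individual_predictions"].items():
    print(f"{name} MAE: {np.mean(np.abs(np.array(preds) - y_true)):.4f}")

Note that "moe_test_mae" in the file is an average of per-batch MAEs, so it can differ slightly from the per-sample MAE computed here when the final batch is smaller than the rest.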
templates/.nfs00000001a2893bde003726a5
ADDED
@@ -0,0 +1 @@
test_moe_model.py
ADDED
@@ -0,0 +1,276 @@
import os
import torch
import torchaudio
import torchvision
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
import sys
from tqdm import tqdm

# Add parent directory to path to import the preprocess functions
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from preprocess import process_audio_data, process_image_data

# Import the WatermelonDataset and WatermelonModelModular from the evaluate_backbones.py file
from evaluate_backbones import WatermelonDataset, WatermelonModelModular, IMAGE_BACKBONES, AUDIO_BACKBONES

# Print library versions
print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")

# Device selection
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"\033[92mINFO\033[0m: Using device: {device}")

# Define the top-performing models based on the previous evaluation
TOP_MODELS = [
    {"image_backbone": "efficientnet_b3", "audio_backbone": "transformer"},
    {"image_backbone": "efficientnet_b0", "audio_backbone": "transformer"},
    {"image_backbone": "resnet50", "audio_backbone": "transformer"}
]

# Define class for the MoE model
class WatermelonMoEModel(torch.nn.Module):
    def __init__(self, model_configs, model_dir="test_models", weights=None):
        """
        Mixture of Experts model that combines multiple backbone models.

        Args:
            model_configs: List of dictionaries with 'image_backbone' and 'audio_backbone' keys
            model_dir: Directory where model checkpoints are stored
            weights: Optional list of weights for each model (None for equal weighting)
        """
        super(WatermelonMoEModel, self).__init__()
        self.models = []
        self.model_configs = model_configs

        # Load each model
        for config in model_configs:
            img_backbone = config["image_backbone"]
            audio_backbone = config["audio_backbone"]

            # Initialize model
            model = WatermelonModelModular(img_backbone, audio_backbone)

            # Load weights
            model_path = os.path.join(model_dir, f"{img_backbone}_{audio_backbone}_model.pt")
            if os.path.exists(model_path):
                print(f"\033[92mINFO\033[0m: Loading model {img_backbone}_{audio_backbone} from {model_path}")
                model.load_state_dict(torch.load(model_path, map_location=device))
            else:
                print(f"\033[91mERR!\033[0m: Model checkpoint not found at {model_path}")
                continue

            model.to(device)
            model.eval()  # Set to evaluation mode
            self.models.append(model)

        # Set model weights (uniform by default)
        if weights:
            assert len(weights) == len(self.models), "Number of weights must match number of models"
            self.weights = weights
        else:
            self.weights = [1.0 / len(self.models)] * len(self.models)

        print(f"\033[92mINFO\033[0m: Loaded {len(self.models)} models for MoE ensemble")
        print(f"\033[92mINFO\033[0m: Model weights: {self.weights}")

    def forward(self, mfcc, image):
        """
        Forward pass through the MoE model.
        Returns the weighted average of all model outputs.
        """
        outputs = []

        # Get outputs from each model
        with torch.no_grad():
            for i, model in enumerate(self.models):
                output = model(mfcc, image)
                outputs.append(output * self.weights[i])

        # Return weighted average
        return torch.sum(torch.stack(outputs), dim=0)

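# The forward pass above is a fixed (non-learned) mixture: with weights w_i summing to 1,
# the ensemble prediction is y_hat = sum_i w_i * f_i(mfcc, image). A hypothetical usage
# sketch, assuming the three checkpoints exist under the default model_dir and the inputs
# have already been preprocessed into MFCC and image tensors:
#
#     moe = WatermelonMoEModel(TOP_MODELS, model_dir="test_models")
#     brix = moe(mfcc.to(device), image.to(device))   # shape (batch, 1), predicted sweetness
#
# With uniform weighting over the three experts, each w_i is 1/3.
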
def evaluate_moe_model(data_dir, model_dir="test_models", weights=None):
    """
    Evaluate the MoE model on the test set.
    """
    # Load dataset
    print(f"\033[92mINFO\033[0m: Loading dataset from {data_dir}")
    dataset = WatermelonDataset(data_dir)
    n_samples = len(dataset)

    # Split dataset
    train_size = int(0.7 * n_samples)
    val_size = int(0.2 * n_samples)
    test_size = n_samples - train_size - val_size

    _, _, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )

    # Use a reasonable batch size
    batch_size = 8
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize MoE model
    moe_model = WatermelonMoEModel(TOP_MODELS, model_dir, weights)
    moe_model.eval()

    # Evaluation metrics
    mae_criterion = torch.nn.L1Loss()
    mse_criterion = torch.nn.MSELoss()

    test_mae = 0.0
    test_mse = 0.0

    print(f"\033[92mINFO\033[0m: Evaluating MoE model on {len(test_dataset)} test samples")

    # Individual model predictions for analysis
    individual_predictions = {f"{config['image_backbone']}_{config['audio_backbone']}": []
                              for config in TOP_MODELS}
    true_labels = []
    moe_predictions = []

    # Evaluation loop
    test_iterator = tqdm(test_loader, desc="Testing MoE")

    with torch.no_grad():
        for i, (mfcc, image, label) in enumerate(test_iterator):
            try:
                mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)

                # Store individual model outputs for analysis
                for j, model in enumerate(moe_model.models):
                    config = TOP_MODELS[j]
                    model_name = f"{config['image_backbone']}_{config['audio_backbone']}"
                    output = model(mfcc, image)
                    individual_predictions[model_name].extend(output.view(-1).cpu().numpy())

                # Get MoE prediction
                output = moe_model(mfcc, image)
                moe_predictions.extend(output.view(-1).cpu().numpy())

                # Store true labels
                label = label.view(-1, 1).float()
                true_labels.extend(label.view(-1).cpu().numpy())

                # Calculate metrics
                mae = mae_criterion(output, label)
                mse = mse_criterion(output, label)

                test_mae += mae.item()
                test_mse += mse.item()

                test_iterator.set_postfix({"MAE": f"{mae.item():.4f}", "MSE": f"{mse.item():.4f}"})

                # Clean up memory
                if device.type == 'cuda':
                    del mfcc, image, label, output, mae, mse
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error in test batch {i}: {e}")
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                continue

    # Calculate average metrics
    avg_test_mae = test_mae / len(test_loader) if len(test_loader) > 0 else float('inf')
    avg_test_mse = test_mse / len(test_loader) if len(test_loader) > 0 else float('inf')

    print(f"\n\033[92mINFO\033[0m: === MoE Model Results ===")
    print(f"Test MAE: {avg_test_mae:.4f}")
    print(f"Test MSE: {avg_test_mse:.4f}")

    # Compare with individual models
    print(f"\n\033[92mINFO\033[0m: === Comparison with Individual Models ===")
    print(f"{'Model':<30} {'Test MAE':<15}")
    print("="*45)

    # Load previous results
    results_file = "backbone_evaluation_results.json"
    if os.path.exists(results_file):
        with open(results_file, 'r') as f:
            previous_results = json.load(f)

        # Filter results for our top models
        for config in TOP_MODELS:
            img_backbone = config["image_backbone"]
            audio_backbone = config["audio_backbone"]

            for result in previous_results:
                if result["image_backbone"] == img_backbone and result["audio_backbone"] == audio_backbone:
                    print(f"{img_backbone}_{audio_backbone:<20} {result['test_mae']:<15.4f}")

    print(f"MoE (Ensemble) {avg_test_mae:<15.4f}")

    # Save results and predictions
    results = {
        "moe_test_mae": float(avg_test_mae),
        "moe_test_mse": float(avg_test_mse),
        "true_labels": [float(x) for x in true_labels],
        "moe_predictions": [float(x) for x in moe_predictions],
        "individual_predictions": {key: [float(x) for x in values]
                                   for key, values in individual_predictions.items()}
    }

    with open("moe_evaluation_results.json", 'w') as f:
        json.dump(results, f, indent=4)

    print(f"\033[92mINFO\033[0m: Results saved to moe_evaluation_results.json")

    return avg_test_mae, avg_test_mse

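# Worked example of the performance-based weighting used below: the inverse MAEs are
# 1/0.3635 ≈ 2.751, 1/0.3765 ≈ 2.656 and 1/0.3959 ≈ 2.526, which normalize to roughly
# [0.347, 0.335, 0.318]. The strongest expert (efficientnet_b3 + transformer) receives the
# largest share, but the weights stay close to uniform because the three MAEs are similar.
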
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Test Mixture of Experts (MoE) Model for Watermelon Sweetness Prediction")
    parser.add_argument(
        "--data_dir",
        type=str,
        default="../cleaned",
        help="Path to the cleaned dataset directory"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default="test_models",
        help="Directory containing model checkpoints"
    )
    parser.add_argument(
        "--weighting",
        type=str,
        choices=["uniform", "performance"],
        default="uniform",
        help="How to weight the models (uniform or based on performance)"
    )

    args = parser.parse_args()

    # Determine weights based on argument
    weights = None
    if args.weighting == "performance":
        # Weights inversely proportional to the MAE (better models get higher weights)
        # These are the MAE values from the provided results
        mae_values = [0.3635, 0.3765, 0.3959]  # efficientnet_b3+transformer, efficientnet_b0+transformer, resnet50+transformer

        # Convert to weights (inverse of MAE, normalized)
        inverse_mae = [1/mae for mae in mae_values]
        total = sum(inverse_mae)
        weights = [val/total for val in inverse_mae]

        print(f"\033[92mINFO\033[0m: Using performance-based weights: {weights}")
    else:
        print(f"\033[92mINFO\033[0m: Using uniform weights")

    # Evaluate the MoE model
    evaluate_moe_model(args.data_dir, args.model_dir, weights)
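
For reference, a typical invocation of this script would look something like the lines below (paths are illustrative; the expert checkpoints named {image_backbone}_{audio_backbone}_model.pt must exist in the directory passed as --model_dir):

python test_moe_model.py --data_dir ../cleaned --weighting uniform
python test_moe_model.py --data_dir ../cleaned --model_dir test_models --weighting performance

With --weighting performance the ensemble weights are derived from the hard-coded per-expert MAEs as in the worked example above; with the default uniform setting each expert contributes equally.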