Commit 4c8f740
Parent(s): 61e8d59
Add Models & files.

Files changed:
- .docker-compose.yaml +10 -0
- .dockerignore +181 -0
- .gitignore +178 -0
- Dockerfile +9 -7
- README.md +37 -17
- config.py +27 -0
- inference.py +83 -0
- models/base_model.py +153 -0
- models/catboost.py +53 -0
- models/gmboost.py +47 -0
- models/lightgbm.py +69 -0
- models/svm.py +58 -0
- models/xgboost.py +66 -0
- modules/evaluate.py +9 -0
- modules/feature_extraction.py +161 -0
- modules/pipelines.py +83 -0
- modules/preprocessing.py +114 -0
- requirements.txt +0 -0
- requirements_docker.txt +0 -0
- src/app.py +62 -0
- src/streamlit_app.py +0 -40
- utils.py +95 -0
.docker-compose.yaml
ADDED
@@ -0,0 +1,10 @@
version: "3.8"
services:
  inference:
    build:
      context: .
    image: audio-infer
    volumes:
      - ./data/data_20_files:/data
      - ./data/output:/results
    command: ["--team_id", "8"]
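For reference, a minimal sketch of invoking this compose file, assuming Docker Compose v2, that the host folders `data/data_20_files` and `data/output` exist, and that the image entrypoint consumes the extra arguments (the team id value is hypothetical):

```bash
# Build the image for the inference service defined above.
docker compose -f .docker-compose.yaml build

# Run the service once, overriding the default command with a different team id.
docker compose -f .docker-compose.yaml run --rm inference --team_id 42
```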
.dockerignore
ADDED
@@ -0,0 +1,181 @@
data/
catboost_info/
# mlruns/
# !mlruns/models

.git

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
.gitignore
ADDED
@@ -0,0 +1,178 @@
data/
catboost_info/
mlruns/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
Dockerfile
CHANGED
@@ -1,21 +1,23 @@
-FROM python:3.
+FROM python:3.10-slim
 
 WORKDIR /app
 
+# Source: https://stackoverflow.com/questions/55036740/lightgbm-inside-docker-libgomp-so-1-cannot-open-shared-object-file
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     software-properties-common \
     git \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
-COPY
-
+COPY requirements_docker.txt requirements_docker.txt
+RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements_docker.txt
 
-
+COPY . .
 
-
+VOLUME ["/data", "/results"]
 
+EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
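A quick sketch of building and launching the image as the Streamlit app its ENTRYPOINT defines; the port mapping mirrors the EXPOSE/HEALTHCHECK lines above, and the container name is an arbitrary choice:

```bash
# Build the image from this Dockerfile.
docker build -t audio-infer .

# Run it: the entrypoint starts Streamlit on port 8501, so map it to the host.
docker run --rm -p 8501:8501 --name audio-infer-app audio-infer
```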
README.md
CHANGED
@@ -1,20 +1,40 @@
-
-title: Audio Classifier
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-  - streamlit
-pinned: false
-short_description: This space is used to deploy Speaker Age & Gender clf.
-license: mit
----
-
-
-
-
+# Audio Classification
+
+This repository contains a collection of Jupyter notebooks and Python scripts for audio classification tasks using various machine learning and deep learning techniques. The main focus is on classifying audio files into different categories based on their content.
+The project includes the following components:
+- **Data Preprocessing**: Scripts for loading and preprocessing audio data, including feature extraction using libraries like `librosa`.
+- **Model Training**: Jupyter notebooks for training different models, including traditional machine learning algorithms and deep learning architectures.
+- **Model Evaluation**: Scripts for evaluating the performance of trained models using metrics like accuracy, precision, recall, and F1-score.
+- **Visualization**: Tools for visualizing audio data and model performance, including confusion matrices and ROC curves.
+
+## How to Use
+1. Clone the repository:
+   ```bash
+   git clone <repository-url>
+   cd Audio-Classification
+   ```
+2. Install the required dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Prepare your audio dataset and place it in the `data/` directory.
+
+### Using Docker
+1. Build the Docker image:
+   ```bash
+   docker build -t audio-infer .
+   ```
+2. Run the Docker container with your audio files mounted:
+   ```bash
+   docker run --rm -v "$(pwd)/data/data_20_files:/data" -v "$(pwd)/data/output:/results" audio-infer --team_id 8
+   ```
+3. The results will be saved in the `data/output` directory.
+4. You can also run the container with a specific model:
+   ```bash
+   docker run --rm -v "$(pwd)/data/data_20_files:/data" -v "$(pwd)/data/output:/results" audio-infer --team_id 8 --model-name LightGBM
+   ```
+### Using Docker Compose
+1. Build the image and run the inference service:
+   ```bash
+   docker-compose -f .docker-compose.yaml up --build
+   ```
config.py
ADDED
@@ -0,0 +1,27 @@
import os
from pathlib import Path
from tqdm import tqdm
import pandas as pd

if os.path.exists("/kaggle"):
    # Running on Kaggle
    DATA_DIR = Path("/kaggle/input/your-dataset-name")
elif os.path.exists("/content"):
    # Running on Google Colab
    DATA_DIR = Path("/content")
else:
    DATA_DIR = Path("data")

AUDIO_PATH = DATA_DIR / "audios"
AUDIO_CACHE = DATA_DIR / "audio_cache"
PREPROCESSED_CACHE = DATA_DIR / "preprocessed_cache"
FEATURES_CACHE = DATA_DIR / "features_cache"
MODELS_DIR = DATA_DIR / "models"

NUM_WORKERS = os.cpu_count() or 4

def run_config():
    for folder in [AUDIO_CACHE, PREPROCESSED_CACHE, FEATURES_CACHE, MODELS_DIR]:
        folder.mkdir(parents=True, exist_ok=True)

    tqdm.pandas()
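A minimal usage sketch for this config module, assuming it is imported from the project root so the relative `data/` directory resolves:

```python
# Hypothetical usage of config.py: create the cache folders, then build a cache path.
import config

config.run_config()  # creates audio_cache/, preprocessed_cache/, features_cache/, models/
print(config.DATA_DIR)                               # e.g. data/ when running locally
print(config.FEATURES_CACHE / "X_traditional.npy")   # path format used by the feature-cache helpers
```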
inference.py
ADDED
@@ -0,0 +1,83 @@
import argparse
import os
import time
from glob import glob
import pandas as pd
from pathlib import Path
from modules.preprocessing import AudioPreprocessor
from modules.feature_extraction import FeatureExtractor
from models.lightgbm import LightGBMModel
from models.xgboost import XGBoostModel
from modules.pipelines import ModelPipeline
import warnings
warnings.filterwarnings("ignore")

MODEL_NAME = {
    "XGBoost": XGBoostModel,
    "LightGBM": LightGBMModel,
}

def run_batch_inference(model, input_folder, output_folder, sr=16000, feature_mode="traditional"):
    preprocessor = AudioPreprocessor()
    extractor = FeatureExtractor()

    # Sort files in the correct order
    files = sorted(glob(os.path.join(input_folder, "*")), key=lambda x: int(Path(x).stem))

    # Overwrite if exists
    results_path = os.path.join(output_folder, "results.txt")
    time_path = os.path.join(output_folder, "time.txt")
    with open(results_path, "w") as f: pass
    with open(time_path, "w") as f: pass

    pred = 0
    for file in files:
        # Measure inference time
        start_time = time.time()
        y = preprocessor.preprocess(preprocessor.load_audio(str(file), sr=sr))
        if y is not None:
            x = extractor.extract(y, sr=sr, mode=feature_mode, n_mfcc=20)
            pred = model.predict([x])[0]
        end_time = time.time()
        # Save results to results.txt
        with open(results_path, "a") as f:
            f.write(f"{pred}\n")

        # Save inference time to time.txt
        with open(time_path, "a") as f:
            f.write(f"{end_time - start_time:.6f}\n")

    print(f"✅ Results saved to {results_path}")
    print(f"✅ Inference time saved to {time_path}")

def main(input_path, model_name, output_folder):
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input path {input_path} does not exist.")

    if model_name not in MODEL_NAME.keys():
        raise ValueError(f"Model name {model_name} is not valid. Choose from {list(MODEL_NAME.keys())}.")

    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Output folder {output_folder} created.")

    model = ModelPipeline(model=MODEL_NAME[model_name])
    model.load_model_from_registry(model_name=model_name)
    print("✅ Model loaded successfully")

    run_batch_inference(model, input_path, output_folder)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', type=str, default="/data", help="Path to the input folder containing test audio files. Default is '/data'.")
    parser.add_argument('--model-name', type=str, default="XGBoost", help="Name of the model to use for inference. Default is 'XGBoost'.")
    parser.add_argument('--team_id', type=str, required=True, help="Team ID for output folder.")
    args = parser.parse_args()

    output_folder = os.path.join("/results", args.team_id)
    print(f"Input Path: {args.input_path}")
    print(f"Model Name: {args.model_name}")
    print(f"Output Folder: {output_folder}")

    main(args.input_path, args.model_name, output_folder)
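Outside Docker, the script could be invoked directly; a sketch assuming an MLflow registry that already contains a model named "XGBoost" and a readable input folder. Note the output path is hard-coded to /results/&lt;team_id&gt;, which the Docker volumes normally provide:

```bash
# Writes /results/8/results.txt and /results/8/time.txt, one line per input file.
python inference.py --input-path /data --model-name XGBoost --team_id 8
```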
models/base_model.py
ADDED
@@ -0,0 +1,153 @@
import mlflow
from typing import Any, Dict
from numpy import ndarray
from sklearn.base import BaseEstimator
from modules.evaluate import PerformanceAnalyzer

# === Base Model Interface ===
class BaseModel:
    def __init__(self) -> None:
        self.model: BaseEstimator = None
        self.best_params: Dict[str, Any] = {}
        self.model_name: str = self.__class__.__name__  # Automatically set model name

    def train(self, X_train: ndarray, y_train: ndarray, X_val: ndarray, y_val: ndarray) -> None:
        raise NotImplementedError

    def predict(self, X: ndarray) -> ndarray:
        return self.model.predict(X)

    def score(self, X: ndarray, y: ndarray) -> float:
        return self.model.score(X, y)

    def log_mlflow(self, y_val: ndarray, y_pred: ndarray):
        """
        Logs model performance metrics and the trained model to MLflow.

        This method evaluates the model's performance using the provided true
        and predicted values, logs the evaluation metrics to MLflow, and saves
        the trained model to MLflow for tracking and reproducibility.

        Args:
            y_val (ndarray): The ground truth target values.
            y_pred (ndarray): The predicted target values from the model.

        Returns:
            str | dict: A string representation of the evaluation metrics or
            a dictionary containing the metrics.

        Input Example:
            y_val = np.array([1, 0, 1, 1, 0])
            y_pred = np.array([1, 0, 1, 0, 0])
        """
        analyzer = PerformanceAnalyzer()
        metrics, metrics_str = analyzer.evaluate(y_val, y_pred)
        mlflow.log_params(self.best_params or {})

        for category, category_metrics in metrics.items():
            if isinstance(category_metrics, dict):
                mlflow.log_metrics({f"{category}_{k}": v for k, v in category_metrics.items() if isinstance(v, (int, float))})

        mlflow.sklearn.log_model(self.model, "model")
        mlflow.set_tag("model_name", self.model_name)  # Add model name as a tag
        return metrics_str

    def load_model_from_run(
        self,
        run_id: str = None,
        experiment_id: str = None,
        experiment_name: str = None,
        best_metric: str = None,
        maximize: bool = True,
        additional_tags: Dict[str, str] = None
    ) -> None:
        """
        Loads a model from a specific MLflow run, the last run, or the best run based on a metric.

        Args:
            run_id (str, optional): The ID of the MLflow run from which to load the model. Defaults to None.
            experiment_id (str, optional): The ID of the MLflow experiment to search for runs. Defaults to None.
            experiment_name (str, optional): The name of the MLflow experiment to search for runs. Required if run_id is not provided.
            best_metric (str, optional): The metric to use for selecting the best run. Defaults to None. Example: "weighted avg_f1-score".
            maximize (bool, optional): Whether to maximize or minimize the metric when selecting the best run. Defaults to True.
            additional_tags (dict, optional): Additional tags to filter runs. Defaults to None.

        Raises:
            ValueError: If neither `run_id` nor `experiment_name` is provided.
        """
        if run_id:
            # Load model from the specified run ID
            run = mlflow.get_run(run_id)
        # elif experiment_id or experiment_name:
        else:
            # Default to the first experiment if not provided
            if not (experiment_id or experiment_name): experiment_id = "0"

            # Determine the order_by clause
            if best_metric:
                metric_order = f"metrics.'{best_metric}' {'DESC' if maximize else 'ASC'}"
                order_by = [metric_order]
            else:
                order_by = ["start_time DESC"]

            # Build the filter string
            filter_string = f"attributes.run_name LIKE '{self.model_name}%'"
            if additional_tags:
                for key, value in additional_tags.items():
                    filter_string += f" and tags.{key} = '{value}'"

            # Search for the most relevant run with the model name and additional tags as filters
            runs = mlflow.search_runs(
                experiment_ids=[experiment_id] if experiment_id else None,
                experiment_names=[experiment_name] if experiment_name else None,
                filter_string=filter_string,
                order_by=order_by,
                max_results=1
            )

            if runs.empty:
                raise ValueError(f"No runs found in experiment '{experiment_name}' with the specified criteria.")

            # Get the best or last run
            run = mlflow.get_run(runs.iloc[0]["run_id"])
        # else:
        #     raise ValueError("Either 'run_id' or 'experiment_id' or 'experiment_name' must be provided.")

        # Load the model and metadata
        # self.model = mlflow.pyfunc.load_model(mlflow.get_tracking_uri() + f"/{experiment_id}/{run.info.run_id}/artifacts/model")
        self.model = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")
        self.best_params = run.data.params
        self.metrics = run.data.metrics
        self.model_name = run.info.run_name
        self.run_id = run.info.run_id

    def register_model(
        self,
        run_id: str,
        model_name: str = None,
        tags: Dict[str, str] = None
    ) -> None:
        """
        Registers a model in MLflow's Model Registry.

        Args:
            run_id (str): The ID of the MLflow run containing the model to register.
            model_name (str): The name to assign to the registered model.
            description (str, optional): A description for the registered model. Defaults to None.
            tags (dict, optional): Tags to associate with the registered model. Defaults to None.
        """
        mlflow.register_model(
            model_uri=f"runs:/{run_id}/model",
            name=model_name or self.model_name,
            tags=tags
        )

    def load_model_from_registry(self, model_name: str, version: int = None) -> None:
        """
        Loads a model from MLflow's Model Registry.

        Args:
            model_name (str): The name of the model to load.
            version (int, optional): The version of the model to load. If None, the latest version is loaded. Defaults to None.
        """
        self.model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{version if version else 'latest'}")
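A sketch of how a subclass might pull the best run back from MLflow by metric. The experiment name is an assumption, and the metric key follows the `{category}_{metric}` format that `log_mlflow` writes:

```python
# Hypothetical: load the best XGBoost run by weighted-average F1, then predict.
import numpy as np
from models.xgboost import XGBoostModel

model = XGBoostModel()
model.load_model_from_run(
    experiment_name="audio-classification",   # assumed experiment name
    best_metric="weighted avg_f1-score",      # metric key format produced by log_mlflow
    maximize=True,
)
X = np.random.rand(4, 54)   # placeholder feature matrix; width must match the trained features
print(model.predict(X))
```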
models/catboost.py
ADDED
@@ -0,0 +1,53 @@
from catboost import CatBoostClassifier
import optuna
import cupy as cp
from numpy import ndarray
import numpy as np
from models.base_model import BaseModel
from typing import Dict, Any

# === CatBoost Implementation ===
class CatBoostModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()

    def objective(
        self,
        trial: optuna.trial.Trial,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray
    ) -> float:
        params: Dict[str, int] = {
            "iterations": trial.suggest_int("iterations", 300, 500),
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True),
            "depth": trial.suggest_int("depth", 10, 15),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 1.0, log=True),
            "task_type": "GPU" if cp.cuda.is_available() else "CPU",
            "verbose": False
        }
        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
        return model.score(X_val, y_val)

    def train(
        self,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        use_optuna: bool = False,
        n_trials: int = 20
    ) -> None:
        if use_optuna:
            study = optuna.create_study(direction="maximize")
            # CatBoost returns device (cuda) already in use if n_jobs > 1
            study.optimize(lambda trial: self.objective(trial, X_train, y_train, X_val, y_val), n_trials=n_trials)  # , n_jobs=-1
            self.best_params: Dict[str, Any] = study.best_params
            self.model = CatBoostClassifier(**self.best_params, verbose=False)
        else:
            self.model = CatBoostClassifier(verbose=0)
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        self.model.fit(X, y)
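All of the model classes in this commit share the same Optuna pattern: an `objective` that samples hyperparameters, fits, and returns a validation score, plus `create_study`/`optimize`/`best_params` in `train`. A standalone illustration of that pattern on a toy dataset (not part of this repository):

```python
# Illustration of the objective/study pattern used by the model classes above.
import optuna
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=0)
    model.fit(X_tr, y_tr)
    return model.score(X_va, y_va)   # validation accuracy, maximized by the study

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print(study.best_params)             # analogous to self.best_params in the classes above
```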
models/gmboost.py
ADDED
@@ -0,0 +1,47 @@
from sklearn.ensemble import GradientBoostingClassifier
import optuna
from models.base_model import BaseModel
from numpy import ndarray
import numpy as np

# === GradientBoosting Implementation ===
class GradientBoostingModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()

    def objective(
        self,
        trial: optuna.trial.Trial,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray
    ) -> float:
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = GradientBoostingClassifier(**params)
        model.fit(X_train, y_train)
        return model.score(X_val, y_val)

    def train(
        self,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        use_optuna: bool = False,
        n_trials: int = 20
    ) -> None:
        if use_optuna:
            study = optuna.create_study(direction="maximize")
            study.optimize(lambda trial: self.objective(trial, X_train, y_train, X_val, y_val), n_trials=n_trials, n_jobs=-1)
            self.best_params = study.best_params
            self.model = GradientBoostingClassifier(**self.best_params)
        else:
            self.model = GradientBoostingClassifier()
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        self.model.fit(X, y)
models/lightgbm.py
ADDED
@@ -0,0 +1,69 @@
import optuna
from models.base_model import BaseModel
import lightgbm as lgb
from numpy import ndarray
import numpy as np
from sklearn.utils import class_weight


# === LightGBM Implementation ===
class LightGBMModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()

    def objective(
        self,
        trial: optuna.trial.Trial,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        class_weight_type: str = "",
    ) -> float:
        params = {
            "objective": "multiclass",
            "num_class": len(set(y_train)),
            "metric": "multi_logloss",
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 2e-1, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 50, 130),
            "max_depth": trial.suggest_int("max_depth", 20, 30),
            "min_child_samples": trial.suggest_int("min_child_samples", 20, 50),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "n_jobs": -1,
            "verbosity": -1
        }
        model = lgb.LGBMClassifier(**params)
        # Compute class weights if specified
        if class_weight_type:
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y_train)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], sample_weight=class_weights)
        else:
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])  # Fit without class weights
        return model.score(X_val, y_val)

    def train(
        self,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        use_optuna: bool = False,
        n_trials: int = 20,
        class_weight_type: str = "",
    ) -> None:
        if use_optuna:
            study = optuna.create_study(direction="maximize")
            study.optimize(lambda trial: self.objective(trial, X_train, y_train, X_val, y_val, class_weight_type), n_trials=n_trials, n_jobs=-1, show_progress_bar=True)
            self.best_params = study.best_params
            self.model = lgb.LGBMClassifier(**self.best_params, verbosity=-1)
        else:
            self.model = lgb.LGBMClassifier(**self.best_params)
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        if class_weight_type:
            # Compute class weights if specified
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y)
            self.model.fit(X, y, sample_weight=class_weights)
        else:
            # Fit without class weights
            self.model.fit(X, y)
models/svm.py
ADDED
@@ -0,0 +1,58 @@
from models.base_model import BaseModel
from sklearn.svm import SVC
from sklearn.utils import class_weight
import optuna
from numpy import ndarray
import numpy as np


# === SVM Implementation ===
class SVMModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()

    def objective(
        self,
        trial: optuna.trial.Trial,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        class_weight_type: str = "",
    ) -> float:
        params = {
            "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        }
        model = SVC(**params, probability=False)
        if class_weight_type:
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y_train)
            model.fit(X_train, y_train, sample_weight=class_weights)
        else:
            model.fit(X_train, y_train)
        return model.score(X_val, y_val)

    def train(
        self,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        use_optuna: bool = False,
        n_trials: int = 20,
        class_weight_type: str = "",
    ) -> None:
        if use_optuna:
            study = optuna.create_study(direction="maximize")
            study.optimize(lambda trial: self.objective(trial, X_train, y_train, X_val, y_val, class_weight_type), n_trials=n_trials, n_jobs=-1, show_progress_bar=True)
            self.best_params = study.best_params
            self.model = SVC(**self.best_params, probability=False)
        else:
            self.model = SVC(**self.best_params, probability=False)
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        if class_weight_type:
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y)
            self.model.fit(X, y, sample_weight=class_weights)
        else:
            self.model.fit(X, y)
models/xgboost.py
ADDED
@@ -0,0 +1,66 @@
from models.base_model import BaseModel
from xgboost import XGBClassifier
import optuna
from numpy import ndarray
import numpy as np
import cupy as cp
from sklearn.utils import class_weight


# === XGBoost Implementation ===
class XGBoostModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()

    def objective(
        self,
        trial: optuna.trial.Trial,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        class_weight_type: str = "",
    ) -> float:
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
            "max_depth": trial.suggest_int("max_depth", 15, 20),
            "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
            "device": "cuda" if cp.cuda.is_available() else "cpu",
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric="mlogloss")
        if class_weight_type:
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y_train)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False, sample_weight=class_weights)
        else:
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        return model.score(X_val, y_val)

    def train(
        self,
        X_train: ndarray,
        y_train: ndarray,
        X_val: ndarray,
        y_val: ndarray,
        use_optuna: bool = False,
        n_trials: int = 20,
        class_weight_type: str = "",
    ) -> None:
        if use_optuna:
            study = optuna.create_study(direction="maximize")
            study.optimize(lambda trial: self.objective(trial, X_train, y_train, X_val, y_val, class_weight_type), n_trials=n_trials, n_jobs=2, show_progress_bar=True)
            self.best_params = study.best_params
            self.model = XGBClassifier(**self.best_params, use_label_encoder=False, eval_metric="mlogloss")
        else:
            self.model = XGBClassifier(**self.best_params, use_label_encoder=False, eval_metric="mlogloss")
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        if class_weight_type:
            # Source: https://stackoverflow.com/questions/42192227/xgboost-python-classifier-class-weight-option
            class_weights = class_weight.compute_sample_weight(class_weight_type, y=y)
            self.model.fit(X, y, sample_weight=class_weights)
        else:
            self.model.fit(X, y)
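The LightGBM, SVM, and XGBoost classes all handle class imbalance the same way: per-sample weights from `sklearn.utils.class_weight.compute_sample_weight`, passed to `fit(..., sample_weight=...)`. A standalone illustration on toy labels:

```python
# Standalone illustration of the sample-weight computation used above.
import numpy as np
from sklearn.utils import class_weight

y = np.array([0, 0, 0, 0, 1, 1, 2])              # imbalanced toy labels
weights = class_weight.compute_sample_weight("balanced", y=y)
print(weights)   # rarer classes receive larger per-sample weights
# These weights are then forwarded as fit(..., sample_weight=weights).
```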
modules/evaluate.py
ADDED
@@ -0,0 +1,9 @@
from numpy import ndarray  # For type hinting
from sklearn.metrics import classification_report

# === Evaluation ===
class PerformanceAnalyzer:
    def evaluate(self, y_true: ndarray, y_pred: ndarray):
        report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        report_str = classification_report(y_true, y_pred, zero_division=0)
        return report, report_str
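A minimal check of the evaluator on dummy labels, showing both the dict form (used for MLflow logging) and the printable table:

```python
import numpy as np
from modules.evaluate import PerformanceAnalyzer

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0])
report, report_str = PerformanceAnalyzer().evaluate(y_true, y_pred)
print(report_str)                           # human-readable classification report
print(report["weighted avg"]["f1-score"])   # dict entry logged as "weighted avg_f1-score"
```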
modules/feature_extraction.py
ADDED
@@ -0,0 +1,161 @@
# import torch
import librosa
import numpy as np
import parselmouth
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from config import FEATURES_CACHE
from pathlib import Path
from typing import Tuple, Optional

# === Feature Extraction ===
class FeatureExtractor:
    def __init__(self) -> None:
        # self.wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        # self.wav2vec_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        pass

    def traditional(self, y: np.ndarray, sr: int = 16000, n_mfcc: int = 13) -> np.ndarray:
        # MFCCs (13 is standard for voice tasks)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # delta = librosa.feature.delta(mfcc)
        # delta2 = librosa.feature.delta(mfcc, order=2)

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

        # # Tonnetz
        # tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # RMS Energy & ZCR
        rmse = librosa.feature.rms(y=y)
        zcr = librosa.feature.zero_crossing_rate(y)

        # Spectral Centroid
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        #* PROSODIC FEATURES
        # Fundamental frequency (F0) using YIN
        try:
            f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr)
            f0_mean = np.nanmean(f0)
            f0_std = np.nanstd(f0)
            f0_max = np.nanmax(f0)
        except:
            f0_mean = f0_std = f0_max = 0

        # Loudness (Log energy)
        loudness = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        loudness_mean = np.mean(loudness)
        loudness_std = np.std(loudness)

        # Rhythm / Duration
        intervals = librosa.effects.split(y, top_db=30)
        durations = [(e - s) / sr for s, e in intervals]
        if durations:
            dur_mean = np.mean(durations)
            dur_std = np.std(durations)
            dur_count = len(durations)
        else:
            dur_mean = dur_std = dur_count = 0

        # Formant Features
        formants = self.extract_formants(y, sr)
        f1_mean = formants["f1_mean"]
        f1_std = formants["f1_std"]
        f2_mean = formants["f2_mean"]
        f2_std = formants["f2_std"]

        return np.concatenate([
            mfcc.mean(axis=1),
            # delta.mean(axis=1),
            # delta2.mean(axis=1),
            chroma.mean(axis=1),
            contrast.mean(axis=1),
            # tonnetz.mean(axis=1),
            [rmse.mean()],
            [zcr.mean()],
            [centroid.mean()],
            [f0_mean, f0_std, f0_max],
            [loudness_mean, loudness_std],
            [dur_mean, dur_std, dur_count],
            [f1_mean, f1_std, f2_mean, f2_std],
        ])

    def extract_formants(self, audio: np.ndarray, sr: int = 16000) -> dict:
        try:
            sound = parselmouth.Sound(audio, sampling_frequency=sr)
            formant = sound.to_formant_burg()

            duration = sound.duration
            times = np.linspace(0.01, duration - 0.01, 100)
            f1_list, f2_list = [], []

            for t in times:
                f1 = formant.get_value_at_time(1, t)
                f2 = formant.get_value_at_time(2, t)
                if f1: f1_list.append(f1)
                if f2: f2_list.append(f2)

            return {
                "f1_mean": np.nanmean(f1_list) if f1_list else 0,
                "f1_std": np.nanstd(f1_list) if f1_list else 0,
                "f2_mean": np.nanmean(f2_list) if f2_list else 0,
                "f2_std": np.nanstd(f2_list) if f2_list else 0,
            }
        except Exception as e:
            print(f"[Formant Error] {e}")
            return {
                "f1_mean": 0, "f1_std": 0,
                "f2_mean": 0, "f2_std": 0,
            }

    # def wav2vec(self, y: np.ndarray, sr: int = 16000) -> np.ndarray:
    #     if sr != 16000:
    #         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    #     input_values: torch.Tensor = self.wav2vec_proc(y, return_tensors="pt", sampling_rate=16000).input_values
    #     with torch.no_grad():
    #         embeddings: torch.Tensor = self.wav2vec_model(input_values).last_hidden_state
    #     return embeddings.mean(dim=1).squeeze().numpy()

    def extract(self, y: np.ndarray, sr: int = 16000, mode: str = "traditional", n_mfcc: int = 40) -> np.ndarray:
        return self.traditional(y, sr, n_mfcc=n_mfcc) if mode == "traditional" else self.wav2vec(y, sr)

    def cache_features(self, X: np.ndarray, y: np.ndarray, mode: str, version: Optional[int] = None, force_update: bool = False) -> None:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if force_update or not X_path.exists() or not y_path.exists():
            np.save(X_path, X)
            np.save(y_path, y)

    def load_cached_features(self, mode: str, version: Optional[int] = None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if X_path.exists() and y_path.exists():
            return np.load(X_path), np.load(y_path)
        return None, None

    def remove_cached_features(self, mode: str, version: Optional[int] = None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if X_path.exists(): X_path.unlink()
        if y_path.exists(): y_path.unlink()
        return None, None

    def merge_features(self, mode: str) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X = []
        y = []
        for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy"):
            X.append(np.load(file))
            y.append(np.load(file.with_name(file.name.replace("X_", "y_"))))
        return np.concatenate(X), np.concatenate(y) if y else None

    def get_latest_version(self, mode: str) -> int:
        versions = [
            int(file.stem.split("_v")[-1])
            for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy")
            if "_v" in file.stem and file.stem.split("_v")[-1].isdigit()
        ]
        return max(versions) if versions else 0
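A sketch of extracting the traditional feature vector for one file, assuming a local WAV path (placeholder) and the 16 kHz rate used throughout the repository:

```python
# Hypothetical single-file extraction; "sample.wav" is a placeholder path.
import librosa
from modules.feature_extraction import FeatureExtractor

y, sr = librosa.load("sample.wav", sr=16000)
features = FeatureExtractor().extract(y, sr=sr, mode="traditional", n_mfcc=20)
print(features.shape)   # 1-D vector of MFCC/chroma/contrast means plus prosodic and formant stats
```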
modules/pipelines.py
ADDED
@@ -0,0 +1,83 @@
import mlflow
from datetime import datetime
from models.lightgbm import LightGBMModel
from modules.preprocessing import AudioPreprocessor
from models.base_model import BaseModel
from typing import Tuple
import numpy as np
from typing import Dict, Optional
from modules.evaluate import PerformanceAnalyzer

# === Unified Model Pipeline ===
class ModelPipeline:
    def __init__(self, model: BaseModel = LightGBMModel) -> None:
        self.model = model()
        self.model_name = self.model.__class__.__name__
        self.best_params = {}
        self.metrics = {}
        self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.preprocessor = AudioPreprocessor()

    def load_model(self, run_id: str = None, experiment_id: str = None, experiment_name: str = None, best_metric: str = None, maximize: bool = True, additional_tags: Dict[str, str] = None) -> None:
        self.model.load_model_from_run(run_id, experiment_id, experiment_name, best_metric, maximize, additional_tags)

    def train(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray,
        X_test: Optional[np.ndarray] = None,
        y_test: Optional[np.ndarray] = None,
        use_optuna: bool = False,
        n_trials: int = 20,
        class_weight_type: str = "",
        save_run: bool = True,
        experiment_name: Optional[str] = None,
        run_name: str = None,
        mlflow_tags: Optional[Dict[str, str]] = None,
    ) -> (str | dict):

        try:
            experiment_id = mlflow.set_experiment(experiment_name).experiment_id if experiment_name else None
        except mlflow.exceptions.RestException:
            experiment_id = None

        with mlflow.start_run(run_name=run_name or f"{self.model_name}_{self.run_id}", experiment_id=experiment_id):
            self.model.train(X_train, y_train, X_val, y_val, use_optuna=use_optuna, n_trials=n_trials, class_weight_type=class_weight_type)

            ## If X_test and y_test are not provided, use X_val and y_val for testing
            if X_test is None or y_test is None:
                X_test, y_test = X_val, y_val

            y_pred_test = self.model.predict(X_test)
            if save_run:
                metrics = self.model.log_mlflow(y_test, y_pred_test)
                mlflow.set_tags(mlflow_tags or {})
            else:
                metrics = self.model.classification_report(y_test, y_pred_test)

            return metrics

    def load_model_from_registry(self, model_name: str, version: int = None) -> None:
        self.model.load_model_from_registry(model_name, version)

    def register_model(
        self,
        run_id: str,
        model_name: str = None,
        tags: Dict[str, str] = None
    ) -> None:
        self.model.register_model(run_id, model_name, tags)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def score(self, X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
        return self.model.score(X, y)

    def classification_report(self, X: np.ndarray, y: np.ndarray) -> str:
        evaluator = PerformanceAnalyzer()
        y_pred = self.model.predict(X)
        report, report_str = evaluator.evaluate(y, y_pred)
        return report_str
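A hypothetical end-to-end sketch of driving ModelPipeline. The toy arrays, split sizes, and run name are illustrative only; it assumes the model classes added in this commit train cleanly on plain NumPy arrays, and uses save_run=False so no MLflow metrics are logged (a local run is still opened in the default ./mlruns store):

import numpy as np
from modules.pipelines import ModelPipeline
from models.xgboost import XGBoostModel

# Toy feature matrix standing in for extracted audio features.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 40))
y = rng.integers(0, 4, size=200)

pipeline = ModelPipeline(model=XGBoostModel)
metrics = pipeline.train(
    X[:120], y[:120],        # train split
    X[120:160], y[120:160],  # validation split
    X[160:], y[160:],        # test split
    use_optuna=False,
    save_run=False,          # skip MLflow metric logging for this local sketch
    run_name="sketch_run",
)
print(metrics)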
modules/preprocessing.py
ADDED
@@ -0,0 +1,114 @@
import numpy as np
import librosa
from config import PREPROCESSED_CACHE
import noisereduce as nr
from sklearn.model_selection import train_test_split
from typing import Optional
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from imblearn.combine import SMOTETomek
import random
from collections import Counter


# === Preprocessing ===
class AudioPreprocessor:
    def __init__(self):
        self.augment_pipeline = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0),
            TimeStretch(min_rate=0.9, max_rate=1.1, p=1.0),
            PitchShift(min_semitones=-2, max_semitones=2, p=1.0),
            Shift(min_shift=-0.2, max_shift=0.2, p=1.0),
        ])
        self.augment_prob_by_class = {  # set your probabilities here
            0: 0.01,
            1: 0.8,
            2: 0.9,
            3: 0.95
        }

    def load_audio(self, path: str, sr: int = 16000) -> Optional[np.ndarray]:
        try:
            y, _ = librosa.load(path, sr=sr)
            return y
        except Exception as e:
            print(f"[ERROR] {path}: {e}")
            return None

    def preprocess(self, y: Optional[np.ndarray], sr: int = 16000, padding: bool = False, label: Optional[int] = None) -> Optional[np.ndarray]:
        if y is None: return None

        # Remove silence
        intervals = librosa.effects.split(y, top_db=20)
        y_trimmed = np.concatenate([y[start:end] for start, end in intervals])

        # Normalize volume: volume variations, different microphone quality
        y_norm = librosa.util.normalize(y_trimmed)

        # Noise reduction
        y_denoised = nr.reduce_noise(y=y_norm, sr=sr, n_jobs=-1)

        # Conditional augmentation
        if label is not None and random.random() < self.augment_prob_by_class.get(label, 0.5):
            y_augmented = self.augment_pipeline(samples=y_denoised, sample_rate=sr)
        else:
            y_augmented = y_denoised

        # Padding
        if padding:
            desired_len = sr * 5
            if len(y_augmented) > desired_len:
                y_augmented = y_augmented[:desired_len]
            else:
                y_augmented = np.pad(y_augmented, (0, max(0, desired_len - len(y_augmented))))

        return y_augmented

    def cache_preprocessed(self, idx: str, y: np.ndarray, force_update: bool = False) -> None:
        path = PREPROCESSED_CACHE / f"{idx}.npy"
        if force_update or not path.exists():
            np.save(path, y)

    def load_cached_preprocessed(self, idx: str) -> Optional[np.ndarray]:
        try:
            path = PREPROCESSED_CACHE / f"{idx}.npy"
            return np.load(path) if path.exists() else None
        except Exception as e:
            print(f"[ERROR] {path}: {e}")
            return None

    def split_data(self, X, y, train_size: float = 0.75, val_size: float = 0.1, random_state: int = 42, stratify: bool = True,
                   apply_smote: bool = False, smote_percentage: float = 0.7, verbose=True) -> tuple:

        # First split: train vs (val + test)
        stratify_option = y if stratify else None
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, train_size=train_size, random_state=random_state, stratify=stratify_option
        )

        # Second split: validation vs test
        stratify_temp = y_temp if stratify else None
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, train_size=val_size / (1 - train_size), random_state=random_state, stratify=stratify_temp
        )

        if apply_smote:
            if verbose: print(f"[INFO] Class distribution before SMOTE: {Counter(y_train)}")

            class_counts = Counter(y_train)
            majority_class_count = max(class_counts.values())
            sampling_strategy = {
                cls: int(majority_class_count * smote_percentage) for cls in class_counts.keys()
            }
            sampling_strategy[0] = majority_class_count

            resampler = SMOTETomek(
                random_state=random_state,
                n_jobs=-1,
                sampling_strategy=sampling_strategy  # specify sampling strategy as a dictionary
            )
            X_train, y_train = resampler.fit_resample(X_train, y_train)

            if verbose: print(f"[INFO] Class distribution after SMOTE: {Counter(y_train)}")

        return X_train, y_train, X_val, y_val, X_test, y_test
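A minimal usage sketch of AudioPreprocessor under stated assumptions: the audio path and label below are placeholders, and split_data is shown on an already-extracted feature matrix (random toy values here), which is how the rest of this commit appears to use it:

import numpy as np
from modules.preprocessing import AudioPreprocessor

pre = AudioPreprocessor()

# "data/example.wav" and label=2 are placeholders, not files shipped with this commit.
y = pre.load_audio("data/example.wav", sr=16000)              # returns None on failure
y_clean = pre.preprocess(y, sr=16000, padding=True, label=2)

# split_data works on feature rows (e.g. from FeatureExtractor), not raw waveforms.
X_feats = np.random.rand(100, 40)
labels = np.random.randint(0, 4, size=100)
X_tr, y_tr, X_val, y_val, X_te, y_te = pre.split_data(X_feats, labels, apply_smote=False)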
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
requirements_docker.txt
ADDED
Binary file (6.11 kB).
src/app.py
ADDED
@@ -0,0 +1,62 @@
import streamlit as st
import os
import tempfile
import time
from pathlib import Path
from modules.preprocessing import AudioPreprocessor
from modules.feature_extraction import FeatureExtractor
from models.lightgbm import LightGBMModel
from models.xgboost import XGBoostModel
from modules.pipelines import ModelPipeline
import warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = {
    "XGBoost": XGBoostModel,
    "LightGBM": LightGBMModel,
}

# UI Layout
st.set_page_config(page_title="Audio Classification App", layout="centered")
st.title("🎧 Audio Classification")
st.markdown("Upload an `.mp3` or `.wav` file and select a model to get a prediction.")

# File Upload
uploaded_file = st.file_uploader("Upload your audio file", type=["wav", "mp3"])

# Model Selection
selected_model_name = st.selectbox("Select a model", list(MODEL_NAME.keys()))

# Process if file is uploaded
if uploaded_file is not None:
    # Save uploaded file temporarily
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, "input_audio.wav")
        with open(audio_path, "wb") as f:
            f.write(uploaded_file.read())

        # Preprocess, extract features, predict
        st.info("🔍 Processing audio...")
        try:
            # Initialize pipeline
            preprocessor = AudioPreprocessor()
            extractor = FeatureExtractor()
            model = ModelPipeline(model=MODEL_NAME[selected_model_name])
            model.load_model_from_registry(model_name=selected_model_name)

            # Preprocess and predict
            start_time = time.time()
            y = preprocessor.preprocess(preprocessor.load_audio(audio_path, sr=16000))
            if y is None:
                st.error("Audio preprocessing failed.")
            else:
                x = extractor.extract(y, sr=16000, mode="traditional", n_mfcc=20)
                pred = model.predict([x])[0]
                elapsed = time.time() - start_time

                # Display result
                st.success(f"✅ Predicted Class: `{pred}`")
                st.write(f"Inference time: `{elapsed:.4f}` seconds")
        except Exception as e:
            st.error(f"❌ An error occurred: {str(e)}")
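Note that the app pulls the selected model from an MLflow model registry under the same name ("XGBoost" or "LightGBM"), so it presumably requires a reachable tracking server with those models already registered; with that in place it can be launched locally via the standard `streamlit run src/app.py` command.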
src/streamlit_app.py
DELETED
@@ -1,40 +0,0 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
utils.py
ADDED
@@ -0,0 +1,95 @@
import os
import json
import joblib
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

from config import AUDIO_CACHE, FEATURES_CACHE, MODELS_DIR


#* Audio Caching
def cache_audio(data: np.ndarray, filename: str = "default", force_update=False):
    path = AUDIO_CACHE / f"{filename}.npy"
    if force_update or not path.exists():
        np.save(path, data)

def load_cached_audio(filename: str = "default"):
    path = AUDIO_CACHE / f"{filename}.npy"
    return np.load(path) if path.exists() else None


#* Feature Caching
def cache_features(X, y, feature_name: str = "features", label_name: str = "labels", force_update=False):
    X_path = FEATURES_CACHE / f"{feature_name}.npy"
    y_path = FEATURES_CACHE / f"{label_name}.npy"
    if force_update or not X_path.exists() or not y_path.exists():
        np.save(X_path, X)
        np.save(y_path, y)

def load_cached_features(feature_name: str = "features", label_name: str = "labels"):
    X_path = FEATURES_CACHE / f"{feature_name}.npy"
    y_path = FEATURES_CACHE / f"{label_name}.npy"
    if X_path.exists() and y_path.exists():
        return np.load(X_path), np.load(y_path)
    return None, None


#* Model Caching
def cache_model(model, best_params: dict, model_name: str = None, save_option='default', force_update=False):
    model_class = model.__class__.__name__
    model_folder = MODELS_DIR / (model_name or model_class)
    model_folder.mkdir(exist_ok=True)

    model_path = model_folder / ("model.pkl" if save_option == "joblib" else "model.cbm")
    params_path = model_folder / "best_params.json"

    # Save model
    if force_update or not model_path.exists():
        if save_option == "joblib":
            joblib.dump(model, model_path)
        else:
            model.save_model(model_path)

    # Save best params
    if force_update or not params_path.exists():
        with open(params_path, "w") as f:
            json.dump(best_params, f, indent=2)

def load_model(model_class, model_name: str = None, save_option='default'):
    model_class_name = model_class.__name__
    model_folder = MODELS_DIR / (model_name or model_class_name)

    model_path = model_folder / ("model.pkl" if save_option == "joblib" else "model.cbm")
    params_path = model_folder / "best_params.json"

    if not model_path.exists() or not params_path.exists():
        return None, None

    with open(params_path, "r") as f:
        best_params = json.load(f)

    if save_option == "joblib":
        model = joblib.load(model_path)
    else:
        model = model_class()
        model.load_model(model_path)

    return model, best_params


# === Utility: MLflow Helpers ===
def list_top_mlflow_runs(metric="f1-score", top_n=5):
    client = MlflowClient()
    runs = mlflow.search_runs(experiment_ids=["0"], order_by=[f"metrics.weighted avg.{metric} DESC"])
    return runs[["run_id", "params.model_type", f"metrics.weighted avg.{metric}"]].head(top_n)

def load_mlflow_model(run_id):
    client = MlflowClient()
    run = client.get_run(run_id)
    model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
    params = run.data.params
    metrics = run.data.metrics
    return model, params, metrics
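A hypothetical usage sketch of the MLflow helpers above. It assumes a reachable tracking store whose default experiment ("0") already contains runs that log a "params.model_type" parameter and "weighted avg.f1-score"-style metric names, as the logging code in this commit suggests; the run IDs are whatever the search returns:

from utils import list_top_mlflow_runs, load_mlflow_model

# Rank logged runs by weighted-average F1 and reload the best one.
top_runs = list_top_mlflow_runs(metric="f1-score", top_n=3)
print(top_runs)

best_run_id = top_runs.iloc[0]["run_id"]
model, params, metrics = load_mlflow_model(best_run_id)
print(params, metrics)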