# import torch
import librosa
import numpy as np
import parselmouth
# from transformers import Wav2Vec2Model, Wav2Vec2Processor  # only needed for the (disabled) wav2vec path
from config import FEATURES_CACHE
from pathlib import Path
from typing import Tuple, Optional

# === Feature Extraction ===
class FeatureExtractor:
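    """Extracts handcrafted spectral/prosodic audio features (and, if the
    commented-out code below is re-enabled, wav2vec embeddings) and manages
    an on-disk .npy feature cache under FEATURES_CACHE."""
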
    def __init__(self) -> None:
        # self.wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        # self.wav2vec_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        pass

    def traditional(self, y: np.ndarray, sr: int = 16000, n_mfcc: int = 13) -> np.ndarray:
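        """Return a fixed-length feature vector for one clip: per-coefficient
        MFCC, chroma, and spectral-contrast means, plus scalar RMS/ZCR/centroid
        means, F0 statistics (YIN), log-loudness statistics, voiced-segment
        duration statistics, and F1/F2 formant statistics."""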
        # MFCCs (13 is standard for voice tasks)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # delta = librosa.feature.delta(mfcc)
        # delta2 = librosa.feature.delta(mfcc, order=2)

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

        # # Tonnetz
        # tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # RMS Energy & ZCR
        rmse = librosa.feature.rms(y=y)
        zcr = librosa.feature.zero_crossing_rate(y)

        # Spectral Centroid
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        #* PROSODIC FEATURES
        # Fundamental frequency (F0) using YIN
        try:
            f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr)
            f0_mean = np.nanmean(f0)
            f0_std = np.nanstd(f0)
            f0_max = np.nanmax(f0)
        except Exception:
            f0_mean = f0_std = f0_max = 0.0

        # Loudness (Log energy)
        loudness = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        loudness_mean = np.mean(loudness)
        loudness_std = np.std(loudness)

        # Rhythm / Duration
        intervals = librosa.effects.split(y, top_db=30)
        durations = [(e - s) / sr for s, e in intervals]
        if durations:
            dur_mean = np.mean(durations)
            dur_std = np.std(durations)
            dur_count = len(durations)
        else:
            dur_mean = dur_std = dur_count = 0

        # Formant Features
        formants = self.extract_formants(y, sr)
        f1_mean = formants["f1_mean"]
        f1_std = formants["f1_std"]
        f2_mean = formants["f2_mean"]
        f2_std = formants["f2_std"]

        return np.concatenate([
            mfcc.mean(axis=1),
            # delta.mean(axis=1),
            # delta2.mean(axis=1),
            chroma.mean(axis=1),
            contrast.mean(axis=1),
            # tonnetz.mean(axis=1),
            [rmse.mean()],
            [zcr.mean()],
            [centroid.mean()],
            [f0_mean, f0_std, f0_max],
            [loudness_mean, loudness_std],
            [dur_mean, dur_std, dur_count],
            [f1_mean, f1_std, f2_mean, f2_std],
        ])

    def extract_formants(self, audio: np.ndarray, sr: int = 16000) -> dict:
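        """Estimate F1/F2 statistics with Praat (via parselmouth) by sampling
        the Burg formant track at 100 evenly spaced time points."""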
        try:
            sound = parselmouth.Sound(audio, sampling_frequency=sr)
            formant = sound.to_formant_burg()

            duration = sound.duration
            times = np.linspace(0.01, duration - 0.01, 100)
            f1_list, f2_list = [], []

            for t in times:
                f1 = formant.get_value_at_time(1, t)
                f2 = formant.get_value_at_time(2, t)
                # Praat returns NaN (which is truthy) for undefined frames, so test explicitly
                if not np.isnan(f1): f1_list.append(f1)
                if not np.isnan(f2): f2_list.append(f2)

            return {
                "f1_mean": np.nanmean(f1_list) if f1_list else 0,
                "f1_std": np.nanstd(f1_list) if f1_list else 0,
                "f2_mean": np.nanmean(f2_list) if f2_list else 0,
                "f2_std": np.nanstd(f2_list) if f2_list else 0,
            }
        except Exception as e:
            print(f"[Formant Error] {e}")
            return {
                "f1_mean": 0, "f1_std": 0,
                "f2_mean": 0, "f2_std": 0,
            }

    # def wav2vec(self, y: np.ndarray, sr: int = 16000) -> np.ndarray:
    #     if sr != 16000:
    #         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    #     input_values: torch.Tensor = self.wav2vec_proc(y, return_tensors="pt", sampling_rate=16000).input_values
    #     with torch.no_grad():
    #         embeddings: torch.Tensor = self.wav2vec_model(input_values).last_hidden_state
    #     return embeddings.mean(dim=1).squeeze().numpy()

    def extract(self, y: np.ndarray, sr: int = 16000, mode: str = "traditional", n_mfcc: int = 40) -> np.ndarray:
        if mode == "traditional":
            return self.traditional(y, sr, n_mfcc=n_mfcc)
        # self.wav2vec is commented out above; fail loudly instead of raising AttributeError
        raise NotImplementedError(f"Feature mode '{mode}' is not available; re-enable the wav2vec code to use it")

    def _cache_paths(self, mode: str, version: Optional[int] = None) -> Tuple[Path, Path]:
        # Versioned caches are suffixed _v{version}; the unversioned pair is the default
        suffix = f"{mode}.npy" if version is None else f"{mode}_v{version}.npy"
        return FEATURES_CACHE / f"X_{suffix}", FEATURES_CACHE / f"y_{suffix}"

    def cache_features(self, X: np.ndarray, y: np.ndarray, mode: str, version: Optional[int] = None, force_update: bool = False) -> None:
        X_path, y_path = self._cache_paths(mode, version)
        if force_update or not X_path.exists() or not y_path.exists():
            np.save(X_path, X)
            np.save(y_path, y)

    def load_cached_features(self, mode: str, version: Optional[int] = None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X_path, y_path = self._cache_paths(mode, version)
        if X_path.exists() and y_path.exists():
            return np.load(X_path), np.load(y_path)
        return None, None

    def remove_cached_features(self, mode: str, version: Optional[int] = None) -> None:
        X_path, y_path = self._cache_paths(mode, version)
        if X_path.exists(): X_path.unlink()
        if y_path.exists(): y_path.unlink()
    
    def merge_features(self, mode: str) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X, y = [], []
        for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy"):
            X.append(np.load(file))
            y.append(np.load(file.with_name(file.name.replace("X_", "y_", 1))))
        if not X:
            return None, None
        return np.concatenate(X), np.concatenate(y)

    def get_latest_version(self, mode: str) -> int:
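        """Return the highest _v{n} suffix among cached X_{mode} files, or 0 if none exist."""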
        versions = [
            int(file.stem.split("_v")[-1]) 
            for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy") 
            if "_v" in file.stem and file.stem.split("_v")[-1].isdigit()
        ]
        return max(versions) if versions else 0
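

if __name__ == "__main__":
    # Minimal usage sketch; "sample.wav" is a hypothetical local file, not part
    # of the repo. With the defaults above (n_mfcc=40), traditional() yields a
    # 74-dim vector: 40 MFCC + 12 chroma + 7 contrast + 15 prosodic/formant scalars.
    extractor = FeatureExtractor()
    audio, sr = librosa.load("sample.wav", sr=16000, mono=True)
    features = extractor.extract(audio, sr, mode="traditional")
    print(features.shape)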