# import torch
import librosa
import numpy as np
import parselmouth
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from config import FEATURES_CACHE
from pathlib import Path
from typing import Tuple, Optional
# === Feature Extraction ===
class FeatureExtractor:
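    """Extracts acoustic features from mono audio for voice tasks.

    Supports "traditional" hand-crafted features (MFCC, chroma, spectral,
    prosodic, and formant statistics) plus a wav2vec 2.0 embedding path that
    is currently commented out, and helpers for caching feature arrays.
    """
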
    def __init__(self) -> None:
        # self.wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        # self.wav2vec_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        pass

    def traditional(self, y: np.ndarray, sr: int = 16000, n_mfcc: int = 13) -> np.ndarray:
        # MFCCs (13 is standard for voice tasks)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # delta = librosa.feature.delta(mfcc)
        # delta2 = librosa.feature.delta(mfcc, order=2)

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

        # # Tonnetz
        # tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # RMS Energy & ZCR
        rmse = librosa.feature.rms(y=y)
        zcr = librosa.feature.zero_crossing_rate(y)

        # Spectral Centroid
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        #* PROSODIC FEATURES
        # Fundamental frequency (F0) using YIN
        try:
            f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr)
            f0_mean = np.nanmean(f0)
            f0_std = np.nanstd(f0)
            f0_max = np.nanmax(f0)
        except Exception:
            # YIN can fail on very short or silent clips; fall back to zeros.
            f0_mean = f0_std = f0_max = 0

        # Loudness (log energy)
        loudness = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        loudness_mean = np.mean(loudness)
        loudness_std = np.std(loudness)

        # Rhythm / Duration
        intervals = librosa.effects.split(y, top_db=30)
        durations = [(e - s) / sr for s, e in intervals]
        if durations:
            dur_mean = np.mean(durations)
            dur_std = np.std(durations)
            dur_count = len(durations)
        else:
            dur_mean = dur_std = dur_count = 0

        # Formant Features
        formants = self.extract_formants(y, sr)
        f1_mean = formants["f1_mean"]
        f1_std = formants["f1_std"]
        f2_mean = formants["f2_mean"]
        f2_std = formants["f2_std"]
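
        # With librosa defaults (12 chroma bins, 7 spectral-contrast bands) and
        # n_mfcc=13, the summary vector below has
        # 13 + 12 + 7 + 1 + 1 + 1 + 3 + 2 + 3 + 4 = 47 dimensions
        # (74 when called with n_mfcc=40, as extract() does by default).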
        return np.concatenate([
            mfcc.mean(axis=1),
            # delta.mean(axis=1),
            # delta2.mean(axis=1),
            chroma.mean(axis=1),
            contrast.mean(axis=1),
            # tonnetz.mean(axis=1),
            [rmse.mean()],
            [zcr.mean()],
            [centroid.mean()],
            [f0_mean, f0_std, f0_max],
            [loudness_mean, loudness_std],
            [dur_mean, dur_std, dur_count],
            [f1_mean, f1_std, f2_mean, f2_std],
        ])

    def extract_formants(self, audio: np.ndarray, sr: int = 16000) -> dict:
        try:
            sound = parselmouth.Sound(audio, sampling_frequency=sr)
            formant = sound.to_formant_burg()
            duration = sound.duration
            times = np.linspace(0.01, duration - 0.01, 100)
            f1_list, f2_list = [], []
            for t in times:
                f1 = formant.get_value_at_time(1, t)
                f2 = formant.get_value_at_time(2, t)
                # Praat returns NaN for undefined frames, and NaN is truthy,
                # so filter it out explicitly.
                if f1 and not np.isnan(f1):
                    f1_list.append(f1)
                if f2 and not np.isnan(f2):
                    f2_list.append(f2)
            return {
                "f1_mean": np.nanmean(f1_list) if f1_list else 0,
                "f1_std": np.nanstd(f1_list) if f1_list else 0,
                "f2_mean": np.nanmean(f2_list) if f2_list else 0,
                "f2_std": np.nanstd(f2_list) if f2_list else 0,
            }
        except Exception as e:
            print(f"[Formant Error] {e}")
            return {
                "f1_mean": 0, "f1_std": 0,
                "f2_mean": 0, "f2_std": 0,
            }

    # def wav2vec(self, y: np.ndarray, sr: int = 16000) -> np.ndarray:
    #     if sr != 16000:
    #         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    #     input_values: torch.Tensor = self.wav2vec_proc(y, return_tensors="pt", sampling_rate=16000).input_values
    #     with torch.no_grad():
    #         embeddings: torch.Tensor = self.wav2vec_model(input_values).last_hidden_state
    #     return embeddings.mean(dim=1).squeeze().numpy()

    def extract(self, y: np.ndarray, sr: int = 16000, mode: str = "traditional", n_mfcc: int = 40) -> np.ndarray:
        # The wav2vec branch requires re-enabling the torch import, the model
        # initialisation in __init__, and the wav2vec() method above.
        return self.traditional(y, sr, n_mfcc=n_mfcc) if mode == "traditional" else self.wav2vec(y, sr)

    def cache_features(self, X: np.ndarray, y: np.ndarray, mode: str, version: Optional[int] = None, force_update: bool = False) -> None:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if force_update or not X_path.exists() or not y_path.exists():
            np.save(X_path, X)
            np.save(y_path, y)

    def load_cached_features(self, mode: str, version: Optional[int] = None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if X_path.exists() and y_path.exists():
            return np.load(X_path), np.load(y_path)
        return None, None

    def remove_cached_features(self, mode: str, version: Optional[int] = None) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X_path = FEATURES_CACHE / f"X_{mode}.npy" if version is None else FEATURES_CACHE / f"X_{mode}_v{version}.npy"
        y_path = FEATURES_CACHE / f"y_{mode}.npy" if version is None else FEATURES_CACHE / f"y_{mode}_v{version}.npy"
        if X_path.exists():
            X_path.unlink()
        if y_path.exists():
            y_path.unlink()
        return None, None

    def merge_features(self, mode: str) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        X = []
        y = []
        for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy"):
            X.append(np.load(file))
            y.append(np.load(file.with_name(file.name.replace("X_", "y_"))))
        if not X:
            return None, None
        return np.concatenate(X), np.concatenate(y)

    def get_latest_version(self, mode: str) -> int:
        versions = [
            int(file.stem.split("_v")[-1])
            for file in FEATURES_CACHE.glob(f"X_{mode}_*.npy")
            if "_v" in file.stem and file.stem.split("_v")[-1].isdigit()
        ]
        return max(versions) if versions else 0
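

# Minimal usage sketch (illustrative, not part of the original module):
# "sample.wav" is a hypothetical path, and FEATURES_CACHE must exist before
# any of the caching helpers are called.
if __name__ == "__main__":
    extractor = FeatureExtractor()
    audio, rate = librosa.load("sample.wav", sr=16000)
    features = extractor.extract(audio, rate, mode="traditional")
    print(f"Extracted a {features.shape[0]}-dimensional feature vector")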