Fix pipeline tag, add library_name and link to code

This PR ensures the model can be found at https://huggingface.co/models?pipeline_tag=audio-classification and adds the `library_name` metadata field.

Files changed (1) hide show

README.md +15 -15

README.md CHANGED Viewed

@@ -1,19 +1,21 @@
 ---
-tags:
-- model_hub_mixin
-- pytorch_model_hub_mixin
-license: bsd-2-clause
 language:
 - en
 metrics:
 - accuracy
-base_model:
-- openai/whisper-large-v3
-datasets:
-- ajd12342/paraspeechcaps
 pipeline_tag: audio-classification
 ---
-# Whisper Large v3 for Voice (Sounding) Quality Classification
 # Model Description
 This model implements the voice quality classification described in Vox-Profile: A Speech Foundation Model Benchmark for Characterizing Diverse Speaker and Speech Traits (https://arxiv.org/pdf/2505.14648).
@@ -23,7 +25,6 @@ Specifically, we report speaker-level Macro-F1 scores. Specifically, we randomly
 ### Special Note:
 We exclude EARS from ParaSpeechCaps due to its limited number of samples in the holdout set.
 The included labels are:
 <pre>
 [
@@ -35,8 +36,8 @@ The included labels are:
 ]
 </pre>
-- Library: https://github.com/tiantiaf0627/vox-profile-release
 # How to use this model
 ## Download repo
@@ -55,11 +56,11 @@ pip install -e .
 # Load libraries
 import torch
 import torch.nn.functional as F
-from src.model.voice_quality.whisper_voice_quality import WhisperWrapper
 # Find device
 device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
 # Load model from Huggingface
-model = WhisperWrapper.from_pretrained("tiantiaf/whisper-large-v3-voice-quality").to(device)
 model.eval()
 ```
@@ -76,7 +77,7 @@ voice_quality_label_list = [
 # Load data, here just zeros as the example
 # Our training data filters out audio shorter than 3 seconds (unreliable predictions) and longer than 15 seconds (computational limitation)
-# So you need to prepare your audio to a maximum of 15 seconds, 16kHz and mono channel
 max_audio_length = 15 * 16000
 data = torch.zeros([1, 16000]).float().to(device)[:, :max_audio_length]
 logits = model(
@@ -92,7 +93,6 @@ threshold = 0.7
 predictions = (voice_quality_prob > threshold).int().detach().cpu().numpy()[0].tolist()
 for label_idx in range(len(predictions)):
     if predictions[label_idx] == 1: voice_label.append(voice_quality_label_list[label_idx])
 # print the voice quality labels
 print(voice_label)
 ```

 ---
+base_model:
+- microsoft/wavlm-large
+datasets:
+- ajd12342/paraspeechcaps
 language:
 - en
+license: apache-2.0
 metrics:
 - accuracy
 pipeline_tag: audio-classification
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+library_name: transformers
 ---
+# WavLM-Large for Voice (Sounding) Quality Classification
 # Model Description
 This model implements the voice quality classification described in Vox-Profile: A Speech Foundation Model Benchmark for Characterizing Diverse Speaker and Speech Traits (https://arxiv.org/pdf/2505.14648).
 ### Special Note:
 We exclude EARS from ParaSpeechCaps due to its limited number of samples in the holdout set.
 The included labels are:
 <pre>
 [
 ]
 </pre>
+- Library: https://github.com/tiantiaf0627/vox-profile-release
 # How to use this model
 ## Download repo
 # Load libraries
 import torch
 import torch.nn.functional as F
+from src.model.voice_quality.wavlm_voice_quality import WavLMWrapper
 # Find device
 device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
 # Load model from Huggingface
+model = WavLMWrapper.from_pretrained("tiantiaf/wavlm-large-voice-quality").to(device)
 model.eval()
 ```
 # Load data, here just zeros as the example
 # Our training data filters out audio shorter than 3 seconds (unreliable predictions) and longer than 15 seconds (computational limitation)
+# So you need to prepare your audio to a maximum of 15 seconds, 16kHz, and mono channel
 max_audio_length = 15 * 16000
 data = torch.zeros([1, 16000]).float().to(device)[:, :max_audio_length]
 logits = model(
 predictions = (voice_quality_prob > threshold).int().detach().cpu().numpy()[0].tolist()
 for label_idx in range(len(predictions)):
     if predictions[label_idx] == 1: voice_label.append(voice_quality_label_list[label_idx])
 # print the voice quality labels
 print(voice_label)
 ```