---
license: mit
inference: false
---

A simple use case:

```python
from transformers import Wav2Vec2Processor, HubertModel
import torch
from torch import nn
from datasets import load_dataset

# load demo audio and set up the processor
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

# load our model weights
model = HubertModel.from_pretrained("m-a-p/MERT-v0")

# the audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# take a look at the output shape: there are 13 layers of representations,
# and each layer performs differently on different downstream tasks, so choose empirically
all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
print(all_layer_hidden_states.shape)  # [13 layers, 292 timesteps, 768 feature_dim]

# for utterance-level classification tasks, you can simply reduce the representation over time
time_reduced_hidden_states = all_layer_hidden_states.mean(-2)
print(time_reduced_hidden_states.shape)  # [13, 768]

# you can even use a learnable weighted average over the 13 layers
aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
weighted_avg_hidden_states = aggregator(time_reduced_hidden_states.unsqueeze(0)).squeeze()
print(weighted_avg_hidden_states.shape)  # [768]
```
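
For utterance-level classification, the aggregated 768-dimensional embedding above can be fed to a small downstream head. The sketch below continues from the snippet above (it reuses `weighted_avg_hidden_states` and `nn`); the linear probe and the `num_classes` value are illustrative assumptions for your own task, not part of MERT-v0 itself.

```python
# a minimal sketch of a downstream linear probe on the aggregated embedding;
# num_classes is a placeholder for your own label set (e.g. genre or instrument tags)
num_classes = 10
probe = nn.Linear(768, num_classes)

# weighted_avg_hidden_states has shape [768]; add a batch dimension before classifying
logits = probe(weighted_avg_hidden_states.unsqueeze(0))
print(logits.shape)  # [1, num_classes]
```

In practice, you would typically keep MERT frozen and train only the probe (or a slightly deeper head) with a standard cross-entropy loss on your labeled data.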