Audio-Text-to-Text
Transformers
Safetensors
qwen2_audio
text2text-generation
Inference Endpoints
franken committed on
Commit
8805704
·
verified ·
1 Parent(s): c1773c1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -2
README.md CHANGED
@@ -53,8 +53,9 @@ model = Qwen2AudioForConditionalGeneration.from_pretrained(model_name, torch_dty
53
  # Load example audio
54
  wav_path = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav" # from MMAU dataset
55
  waveform, sampling_rate = torchaudio.load(wav_path)
56
- assert sampling_rate == 16000
57
- audios = [waveform.numpy()]
 
58
 
59
  # Make prompt text
60
  question = "Based on the given audio, identify the source of the speaking voice."
 
53
  # Load example audio
54
  wav_path = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav" # from MMAU dataset
55
  waveform, sampling_rate = torchaudio.load(wav_path)
56
+ if sampling_rate != 16000:
57
+ waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(waveform)
58
+ audios = [waveform[0].numpy()]
59
 
60
  # Make prompt text
61
  question = "Based on the given audio, identify the source of the speaking voice."