mispeech
/

r1-aqa

@@ -8,6 +8,10 @@ tags: []
 <!-- Provide a quick summary of what the model is/does. -->
 ## Inference
@@ -25,7 +29,7 @@ def _get_audio(wav_path):
     return audio
 model_name = "mispeech/r1-aqa"
-audio_url = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav"
 processor = AutoProcessor.from_pretrained(model_name)
 model = Qwen2AudioForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

 <!-- Provide a quick summary of what the model is/does. -->
+## Introduction
+R1-AQA is based on `Qwen2-Audio-7B-Instruct`, but applies the group relative policy optimization (GRPO) algorithm to the Audio Question Answering (AQA) task.
+For more details, please refer to our [GitHub](https://github.com/xiaomi/r1-aqa) and [Report]().
 ## Inference
     return audio
 model_name = "mispeech/r1-aqa"
+audio_url = "test-mini-audios/3fe64f3d-282c-4bc8-a753-68f8f6c35652.wav"  # Copied from MMAU dataset
 processor = AutoProcessor.from_pretrained(model_name)
 model = Qwen2AudioForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")