qingpei committed on
Commit 9a10e16 · verified · 1 Parent(s): cd61489

Update test_infer.py

Files changed (1): test_infer.py (+5 -3)
test_infer.py CHANGED
@@ -15,7 +15,8 @@ if __name__ == '__main__':
         attn_implementation="flash_attention_2",
     ).to("cuda")
 
-    vision_path = "/input/zhangqinglong.zql/assets/"
+    # replace with your asset path
+    vision_path = os.environ.get("VISION_PATH", "") or os.path.join(os.path.dirname(__file__), "vision")
 
     # qa
     # messages = [
@@ -71,12 +72,13 @@ if __name__ == '__main__':
     # },
     # ]
 
+    # note: place the audio file (audio.wav) in the vision_path directory
     messages = [
         {
             "role": "HUMAN",
             "content": [
                 {"type": "text", "text": "Please recognize the language of this speech and transcribe it. Format: oral."},
-                {"type": "audio", "audio": '/input/dongli.xq/BAC009S0915W0292.wav'},
+                {"type": "audio", "audio": os.path.join(vision_path, "audio.wav")},
             ],
         },
     ]
@@ -95,7 +97,7 @@ if __name__ == '__main__':
     for k in inputs.keys():
         if k == "pixel_values" or k == "pixel_values_videos" or k == "audio_feats":
             inputs[k] = inputs[k].to(dtype=torch.bfloat16)
-
+
     generated_ids = model.generate(
         **inputs,
         max_new_tokens=128,
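
For reference, a minimal sketch of the new path resolution (the VISION_PATH environment variable and the audio.wav filename come from the diff above; the fallback "vision" directory name is likewise taken from the committed code):

import os

# Prefer an explicit VISION_PATH environment variable; otherwise fall back
# to a "vision" directory next to this script. os.environ.get returns ""
# when the variable is unset, which is falsy, so the fallback kicks in.
vision_path = os.environ.get("VISION_PATH", "") or os.path.join(
    os.path.dirname(__file__), "vision"
)

# The audio message then reads <vision_path>/audio.wav, so the file must be
# placed there before running the script, e.g.:
#   VISION_PATH=/data/assets python test_infer.py
audio_file = os.path.join(vision_path, "audio.wav")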