llava-hf
/

llava-next-110b-hf

@@ -7,6 +7,7 @@ tags:
 - image-text-to-text
 language:
 - en
 ---
 # LLaVa-Next Model Card
@@ -52,7 +53,20 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-1
 # prepare image and text prompt, using the appropriate prompt template
 url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
 inputs = processor(prompt, image, return_tensors="pt").to(model.device)

 - image-text-to-text
 language:
 - en
+pipeline_tag: image-text-to-text
 ---
 # LLaVa-Next Model Card
 # prepare image and text prompt, using the appropriate prompt template
 url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
 image = Image.open(requests.get(url, stream=True).raw)
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
+# Each "content" value has to be a list of dicts, each with a "type" of either "text" or "image"
+conversation = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "What is shown in this image?"},
+          {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 inputs = processor(prompt, image, return_tensors="pt").to(model.device)