Visual Document Retrieval · Transformers · Safetensors · ColPali · English · colqwen2 · pretraining
tonywu71 committed (verified) · Commit 0d3e414 · Parent: d6c87db

Update README.md

Files changed (1): README.md (+30 −16)
README.md CHANGED
@@ -12,9 +12,6 @@ base_model:
 pipeline_tag: visual-document-retrieval
 ---
 
-> [!WARNING]
-> EXPERIMENTAL: Wait for https://github.com/huggingface/transformers/pull/35778 to be merged before using!
-
 > [!IMPORTANT]
 > This version of ColQwen2 should be loaded with the `transformers 🤗` release, not with `colpali-engine`.
 > It was converted using the `convert_colqwen2_weights_to_hf.py` script
@@ -44,6 +41,7 @@ A validation set is created with 2% of the samples to tune hyperparameters.
 ## Usage
 
 ```python
+import requests
 import torch
 from PIL import Image
 
@@ -51,39 +49,55 @@ from transformers import ColQwen2ForRetrieval, ColQwen2Processor
 from transformers.utils.import_utils import is_flash_attn_2_available
 
 
+# Load the model and the processor
 model_name = "vidore/colqwen2-v1.0-hf"
 
 model = ColQwen2ForRetrieval.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
-    device_map="cuda:0",  # or "mps" if on Apple Silicon
-    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
-).eval()
-
+    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
+)
 processor = ColQwen2Processor.from_pretrained(model_name)
 
-# Your inputs (replace dummy images with screenshots of your documents)
+# The document page screenshots from your corpus
+url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
+url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"
+
 images = [
-    Image.new("RGB", (128, 128), color="white"),
-    Image.new("RGB", (64, 32), color="black"),
+    Image.open(requests.get(url1, stream=True).raw),
+    Image.open(requests.get(url2, stream=True).raw),
 ]
+
+# The queries you want to retrieve documents for
 queries = [
-    "What is the organizational structure for our R&D department?",
-    "Can you provide a breakdown of last year’s financial performance?",
+    "When was the United States Declaration of Independence proclaimed?",
+    "Who printed the edition of Romeo and Juliet?",
 ]
 
 # Process the inputs
-batch_images = processor(images=images).to(model.device)
-batch_queries = processor(text=queries).to(model.device)
+inputs_images = processor(images=images).to(model.device)
+inputs_text = processor(text=queries).to(model.device)
 
 # Forward pass
 with torch.no_grad():
-    image_embeddings = model(**batch_images).embeddings
-    query_embeddings = model(**batch_queries).embeddings
+    image_embeddings = model(**inputs_images).embeddings
+    query_embeddings = model(**inputs_text).embeddings
 
 # Score the queries against the images
 scores = processor.score_retrieval(query_embeddings, image_embeddings)
 
+print("Retrieval scores (query x image):")
+print(scores)
+```
+
+If you have issues loading the images with PIL, you can use the following code to create dummy images:
+
+```python
+images = [
+    Image.new("RGB", (128, 128), color="white"),
+    Image.new("RGB", (64, 32), color="black"),
+]
 ```
 
 ## Limitations
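
For readers adapting the updated snippet above: `processor.score_retrieval` returns one relevance score per (query, image) pair, computed with the late-interaction (MaxSim) scoring used by ColPali-style retrievers. The sketch below is an illustration only, not part of the model card diff; it assumes the `scores` and `queries` variables from the usage example and shows one way to rank the corpus images for each query.

```python
# Minimal sketch, assuming the updated usage snippet above has been run
# and `scores` is a (num_queries, num_images) tensor.
ranking = scores.argsort(dim=1, descending=True)  # per-query ranking of image indices
best_per_query = ranking[:, 0]                    # top-1 image index for each query

for i, query in enumerate(queries):
    top = best_per_query[i].item()
    print(f"{query!r} -> image {top} (score {scores[i, top].item():.2f})")
```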