refactor-model-loading (#4)
feat: loading through jev4 class and stylistic changes (f7cb47c6b07716483dbf5fd311928026ce7cd27a)
- modeling_jina_embeddings_v4.py +127 -87
modeling_jina_embeddings_v4.py
CHANGED
@@ -1,25 +1,23 @@
-import os
 import math
-import numpy as np
-
+import os
 from dataclasses import dataclass
+from enum import Enum
+from functools import partial
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Union, cast
-
+
+import numpy as np
 import torch
+from huggingface_hub import snapshot_download
+from peft import PeftModel
+from peft.utils.hotswap import hotswap_adapter
+from PIL import Image
 from torch import nn
 from torch.utils.data import DataLoader
-
-from functools import partial
-from PIL import Image
 from tqdm import tqdm
-from enum import Enum
-from peft.utils.hotswap import hotswap_adapter
-
 from transformers import BatchFeature
-
-from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
-
-from huggingface_hub import snapshot_download
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen2_5_vl import (Qwen2_5_VLForConditionalGeneration,
+                                            Qwen2_5_VLProcessor)
 
 from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
 
@@ -28,6 +26,13 @@ class PromptType(str, Enum):
     query = "query"
     passage = "passage"
 
+
+class TaskType(str, Enum):
+    retrieval = "retrieval"
+    code = "code"
+    text_matching = "text-matching"
+
+
 class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
     def __init__(self, *args, **kwargs) -> None:
         Qwen2_5_VLProcessor.__init__(self, *args, **kwargs)
@@ -58,8 +63,12 @@ class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
             images = cast(List[List[Image.Image]], images)
             text_doc = []
             for i in range(len(images)):
-                conversation = [{"role": "user", "content": [{"type": "image"}] * len(images[i])}]
-                template = self.apply_chat_template(conversation, add_generation_prompt=False)
+                conversation = [
+                    {"role": "user", "content": [{"type": "image"}] * len(images[i])}
+                ]
+                template = self.apply_chat_template(
+                    conversation, add_generation_prompt=False
+                )
                 text_doc.append(template[self.assistant_prefix_len :])
 
         else:
@@ -78,7 +87,16 @@ class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
         max_length = max([len(pv) for pv in pixel_values])
 
         pixel_values = [
-            torch.cat([pv, torch.zeros((max_length - len(pv), pv.shape[1]), dtype=pv.dtype, device=pv.device)])
+            torch.cat(
+                [
+                    pv,
+                    torch.zeros(
+                        (max_length - len(pv), pv.shape[1]),
+                        dtype=pv.dtype,
+                        device=pv.device,
+                    ),
+                ]
+            )
             for pv in pixel_values
         ]
 
@@ -93,7 +111,11 @@ class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
         padding: Optional[str] = None,
     ) -> BatchFeature:
 
-        max_length = self.text_max_length if max_length is None else min(max_length, self.text_max_length)
+        max_length = (
+            self.text_max_length
+            if max_length is None
+            else min(max_length, self.text_max_length)
+        )
         padded_texts: List[str] = []
 
         for text in texts:
@@ -127,7 +149,7 @@ class JinaEmbeddingsV4ModelOutput:
     multi_vec_emb: Optional[torch.Tensor] = None
 
 
-class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
+class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
     config_class = JinaEmbeddingsV4Config
     main_input_name: ClassVar[str] = "doc_input_ids"
 
@@ -135,7 +157,9 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
         Qwen2_5_VLForConditionalGeneration.__init__(self, config)
         self._init_projection_layers(config)
         self.post_init()
-        self.processor = JinaEmbeddingsV4Processor.from_pretrained(self.name_or_path, trust_remote_code=True)
+        self.processor = JinaEmbeddingsV4Processor.from_pretrained(
+            self.name_or_path, trust_remote_code=True
+        )
         self.single_vector_projector_dim = config.single_vector_projector_dim
         self.multi_vector_projector_dim = config.multi_vector_projector_dim
 
@@ -147,7 +171,9 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
    ) -> torch.Tensor:
        if "pixel_values" in kwargs:
            offsets = kwargs["image_grid_thw"][:, 1] * kwargs["image_grid_thw"][:, 2]
-            kwargs["pixel_values"] = torch.cat([pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0)
+            kwargs["pixel_values"] = torch.cat(
+                [pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0
+            )
 
        position_ids, rope_deltas = super().get_rope_index(  # type: ignore
            input_ids=input_ids,
@@ -155,7 +181,7 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
            attention_mask=attention_mask,
        )
 
-        kwargs[
+        kwargs["output_hidden_states"] = True
 
        outputs = super().forward(
            input_ids,
@@ -199,14 +225,22 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
        Project the hidden states to single-vector embeddings.
        """
        if self._input_has_image(input_ids[0]):  # got document image
-            img_start_pos = torch.where(input_ids[0] == self.config.vision_start_token_id)[0][0]
-            img_end_pos = torch.where(input_ids[0] == self.config.vision_end_token_id)[0][0]
-            pooled_output = hidden_states[0][img_start_pos : img_end_pos + 1].mean(dim=0).unsqueeze(0)
+            img_start_pos = torch.where(
+                input_ids[0] == self.config.vision_start_token_id
+            )[0][0]
+            img_end_pos = torch.where(input_ids[0] == self.config.vision_end_token_id)[
+                0
+            ][0]
+            pooled_output = (
+                hidden_states[0][img_start_pos : img_end_pos + 1]
+                .mean(dim=0)
+                .unsqueeze(0)
+            )
 
        else:  # got query text
-            pooled_output = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1) / torch.sum(
-                attention_mask, dim=1
-            )
+            pooled_output = torch.sum(
+                hidden_states * attention_mask.unsqueeze(-1), dim=1
+            ) / torch.sum(attention_mask, dim=1, keepdim=True)
        single_vec_emb = self.single_vector_projector(pooled_output)
        return torch.nn.functional.normalize(single_vec_emb, dim=-1)
 
@@ -248,15 +282,21 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
        )  # (batch_size, seq_length, hidden_size)
 
        # Compute the embeddings
-        single_vec_emb = self.project_to_single_vector_embeddings(hidden_states, attention_mask, input_ids=input_ids)
-        multi_vec_emb = self.project_to_multi_vector_embeddings(hidden_states, attention_mask)
+        single_vec_emb = self.project_to_single_vector_embeddings(
+            hidden_states, attention_mask, input_ids=input_ids
+        )
+        multi_vec_emb = self.project_to_multi_vector_embeddings(
+            hidden_states, attention_mask
+        )
 
        return JinaEmbeddingsV4ModelOutput(
-            vlm_last_hidden_states=hidden_states if output_vlm_last_hidden_states else None,
+            vlm_last_hidden_states=(
+                hidden_states if output_vlm_last_hidden_states else None
+            ),
            single_vec_emb=single_vec_emb,
            multi_vec_emb=multi_vec_emb,
        )
-
+
    def _process_batches(
        self,
        data: List[Union[str, Image.Image]],
@@ -284,7 +324,11 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
                    embeddings = embeddings.single_vec_emb
                else:
                    embeddings = embeddings.multi_vec_emb
-                results.append(embeddings.cpu() if return_numpy else list(torch.unbind(embeddings)))
+                results.append(
+                    embeddings.cpu()
+                    if return_numpy
+                    else list(torch.unbind(embeddings))
+                )
        if return_numpy:
            return np.concatenate([result.numpy() for result in results], axis=0)
        return [item for sublist in results for item in sublist]
@@ -298,7 +342,9 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
        desc: Optional[str] = None,
        **kwargs,
    ) -> List[torch.Tensor]:
-        processor_fn = partial(self.processor.process_texts, max_length=max_length, prefix="Query")
+        processor_fn = partial(
+            self.processor.process_texts, max_length=max_length, prefix="Query"
+        )
        return self._process_batches(
            data=queries,
            processor_fn=processor_fn,
@@ -325,17 +371,6 @@ class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
            **kwargs,
        )
 
-
-
-class JinaEmbeddingsV4Model:
-    """
-    Wrapper class for QwenVL25Embeddings that handles the loading of models and adapters.
-    """
-
-    def __init__(self, model, adapter_dir):
-        self.model = model
-        self.adapter_dir = adapter_dir
-
    @classmethod
    def from_pretrained(
        cls,
@@ -345,48 +380,53 @@ class JinaEmbeddingsV4Model:
    ):
        if "torch_dtype" not in kwargs:
            kwargs["torch_dtype"] = "auto"
-
-        task = kwargs.pop(
+
+        task = kwargs.pop("task", TaskType.retrieval)
+
+        # Get the base model first
+        base_model = super().from_pretrained(
+            pretrained_model_name_or_path, *args, **kwargs
+        )
+
+        # Configure adapter directory
+        if os.path.isdir(base_model.name_or_path):
+            adapter_dir = os.path.join(base_model.name_or_path, "adapters")
        else:
            adapter_cache_path = snapshot_download(
-                repo_id=
-                allow_patterns=['adapters/*']
+                repo_id=base_model.name_or_path, allow_patterns=["adapters/*"]
            )
-            adapter_dir = os.path.join(adapter_cache_path,
+            adapter_dir = os.path.join(adapter_cache_path, "adapters")
+
+        # Store adapter directory for later use with set_task
+        base_model.adapter_dir = adapter_dir
+
+        # Create the PEFT model with the requested task adapter
+        peft_model = PeftModel.from_pretrained(
+            base_model, os.path.join(adapter_dir, task)
+        )
+
+        # Add set_task method to the PEFT model instance
+        def set_task_method(self, task_name: Union[str, TaskType]):
+            """
+            Set the task adapter for the model.
+
+            Args:
+                task_name (Union[str, TaskType]): The task name. Must be one of TaskType values or
+                    one of ['retrieval', 'text-matching', 'code']
+            """
+            if isinstance(task_name, str):
+                try:
+                    task_name = TaskType(task_name)
+                except ValueError:
+                    valid_tasks = [t.value for t in TaskType]
+                    raise ValueError(
+                        f"Invalid task: {task_name}. Must be one of {valid_tasks}"
+                    )
+
+            adapter_path = os.path.join(self.adapter_dir, task_name.value)
+            hotswap_adapter(self, adapter_path, adapter_name="default")
+
+        # Bind the method to the instance
+        peft_model.set_task = set_task_method.__get__(peft_model, type(peft_model))
+
+        return peft_model
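For context, below is a minimal usage sketch of the loading flow introduced by this commit. It is not part of the diff: the repo id is a placeholder and the local module import is an assumption; only JinaEmbeddingsV4Model.from_pretrained, the task kwarg, TaskType, and set_task come from the code above.

# Minimal sketch of the refactored loading path (assumptions: the file is importable
# locally as modeling_jina_embeddings_v4, and "org/repo-with-adapters" stands in for
# the real Hub repo id).
from modeling_jina_embeddings_v4 import JinaEmbeddingsV4Model, TaskType

# from_pretrained pops `task`, loads the Qwen2.5-VL backbone, resolves the
# adapters/ directory (local path or snapshot_download), and returns a PeftModel
# wrapping the base model with the requested task adapter.
model = JinaEmbeddingsV4Model.from_pretrained(
    "org/repo-with-adapters",   # placeholder repo id
    task=TaskType.retrieval,    # default when omitted
)

# set_task is bound onto the returned PeftModel instance; it validates the name
# against TaskType and hot-swaps the adapter weights in place via hotswap_adapter.
model.set_task("code")
model.set_task(TaskType.text_matching)

Compared to the old wrapper, JinaEmbeddingsV4Model is now itself the Qwen2_5_VLForConditionalGeneration subclass, and adapter handling is attached directly to from_pretrained, so callers no longer go through QwenVL25Embeddings.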