jupyterjazz committed on
Commit 8a9e9ed · 1 Parent(s): 85f64e2

feat: finalized implementation


Signed-off-by: jupyterjazz <[email protected]>

Files changed (4)
  1. config.json +3 -1
  2. custom_lora_module.py +73 -197
  3. modeling_jina_embeddings_v4.py +112 -76
  4. qwen2_5_vl.py +18 -85
config.json CHANGED
@@ -54,5 +54,7 @@
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 151936,
- "truncate_dim": null
+ "truncate_dim": null,
+ "task_names": ["retrieval", "text-matching", "code"],
+ "matryoshka_dims": [128, 256, 512, 1024]
  }
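Note on the two new keys: task_names is the list of task adapters that custom_lora_module.MultiAdapterLinear instantiates for every wrapped linear layer, and matryoshka_dims replaces the hard-coded TRUNCATE_DIMS list that modeling_jina_embeddings_v4.py previously validated truncate_dim against. Below is a minimal sketch of the validation these fields drive, assuming the config is loaded through transformers with trust_remote_code; the checkpoint id is illustrative, not part of this commit.

from transformers import AutoConfig

# Illustrative repo id; any checkpoint shipping this config.json exposes the same attributes.
config = AutoConfig.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)

def check_encode_args(task, truncate_dim=None):
    # Mirrors the checks added in modeling_jina_embeddings_v4.py further down this commit.
    if task not in config.task_names:
        raise ValueError(f"Invalid task: {task}. Must be one of {config.task_names}.")
    if truncate_dim is not None and truncate_dim not in config.matryoshka_dims:
        raise ValueError(
            f"Invalid truncate_dim: {truncate_dim}. Must be one of {config.matryoshka_dims}."
        )

check_encode_args("retrieval", 128)    # passes
# check_encode_args("classification")  # would raise: not in config.task_names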
custom_lora_module.py CHANGED
@@ -2,31 +2,35 @@ from __future__ import annotations
2
 
3
  import math
4
  import warnings
5
- from typing import Any, Optional, Union
6
 
7
  import torch
8
  import torch.nn as nn
9
- import torch.nn.functional as F
10
- from accelerate.utils.imports import is_xpu_available
11
- from torch import svd_lowrank
12
- from transformers.pytorch_utils import Conv1D
13
 
14
- from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
15
- from peft.utils.integrations import (
16
- dequantize_module_weight,
17
- gather_params_ctx,
18
- get_bnb_param_type,
19
- skip_init_on_device,
20
- )
21
- from peft.utils.other import transpose
22
  from peft.tuners.lora import LoraLayer
23
 
24
- class Linear(nn.Module, LoraLayer):
25
- # Lora implemented in a dense layer
 
26
  def __init__(
27
  self,
28
  base_layer,
29
  adapter_name: str,
 
30
  r: int = 0,
31
  lora_alpha: int = 1,
32
  lora_dropout: float = 0.0,
@@ -40,8 +44,9 @@ class Linear(nn.Module, LoraLayer):
40
  ) -> None:
41
  super().__init__()
42
  LoraLayer.__init__(self, base_layer, **kwargs)
43
- self.fan_in_fan_out = fan_in_fan_out
44
 
 
 
45
  self._active_adapter = adapter_name
46
  self.update_layer(
47
  adapter_name,
@@ -55,160 +60,14 @@ class Linear(nn.Module, LoraLayer):
55
  )
56
  self.is_target_conv_1d_layer = is_target_conv_1d_layer
57
 
58
- def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
59
- """
60
- Merge the active adapter weights into the base weights
61
-
62
- Args:
63
- safe_merge (`bool`, *optional*):
64
- If True, the merge operation will be performed in a copy of the original weights and check for NaNs
65
- before merging the weights. This is useful if you want to check if the merge operation will produce
66
- NaNs. Defaults to `False`.
67
- adapter_names (`list[str]`, *optional*):
68
- The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
69
- to `None`.
70
- """
71
- adapter_names = check_adapters_to_merge(self, adapter_names)
72
- if not adapter_names:
73
- # no adapter to merge
74
- return
75
-
76
- for active_adapter in adapter_names:
77
- if active_adapter in self.lora_A.keys():
78
- base_layer = self.get_base_layer()
79
- if safe_merge:
80
- # Note that safe_merge will be slower than the normal merge
81
- # because of the copy operation.
82
- orig_weights = base_layer.weight.data.clone()
83
- delta_weight = self.get_delta_weight(active_adapter)
84
- if not self.use_dora[active_adapter]:
85
- orig_weights += delta_weight
86
- else:
87
- # handle dora
88
- # since delta_weight already includes scaling, set it to 1 here
89
- weight_norm = (
90
- self.lora_magnitude_vector[active_adapter]
91
- .get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1)
92
- .detach()
93
- )
94
- # We need to cache weight_norm because it has to be based on the original weights. We
95
- # cannot calculate it on the fly based on the merged weights when unmerging because its a
96
- # different value
97
- self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
98
- dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
99
- dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
100
- orig_weights = dora_factor * (orig_weights + delta_weight)
101
-
102
- if not torch.isfinite(orig_weights).all():
103
- raise ValueError(
104
- f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
105
- )
106
-
107
- base_layer.weight.data = orig_weights
108
-
109
- if self.lora_bias[active_adapter]:
110
- new_bias = base_layer.bias + self.lora_B[active_adapter].bias
111
- if not torch.isfinite(new_bias).all():
112
- raise ValueError(
113
- f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
114
- )
115
- base_layer.bias.data = new_bias
116
-
117
- else:
118
- delta_weight = self.get_delta_weight(active_adapter)
119
- if not self.use_dora[active_adapter]:
120
- base_layer.weight.data += delta_weight
121
- else:
122
- # handle dora
123
- # since delta_weight already includes scaling, set it to 1 here
124
- weight_norm = (
125
- self.lora_magnitude_vector[active_adapter]
126
- .get_weight_norm(
127
- base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
128
- )
129
- .detach()
130
- )
131
- # We need to cache weight_norm because it has to be based on the original weights. We
132
- # cannot calculate it on the fly based on the merged weights when unmerging because its a
133
- # different value
134
- self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
135
- dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
136
- dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
137
- new_weight = dora_factor * (base_layer.weight.data + delta_weight)
138
- base_layer.weight.data = new_weight
139
-
140
- if self.lora_bias[active_adapter]:
141
- base_layer.bias.data += self.lora_B[active_adapter].bias
142
-
143
- self.merged_adapters.append(active_adapter)
144
-
145
- def unmerge(self) -> None:
146
- """
147
- This method unmerges all merged adapter layers from the base weights.
148
- """
149
- if not self.merged:
150
- warnings.warn("Already unmerged. Nothing to do.")
151
- return
152
- while len(self.merged_adapters) > 0:
153
- active_adapter = self.merged_adapters.pop()
154
- if active_adapter in self.lora_A.keys():
155
- weight = self.get_base_layer().weight
156
- delta_weight = self.get_delta_weight(active_adapter)
157
- if not self.use_dora[active_adapter]:
158
- weight.data -= delta_weight
159
- else:
160
- weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
161
- dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
162
- weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight
163
- weight.data = weight_orig
164
-
165
- if self.lora_bias[active_adapter]:
166
- self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias
167
 
168
- def get_delta_weight(self, adapter) -> torch.Tensor:
169
- """
170
- Compute the delta weight for the given adapter.
171
-
172
- Args:
173
- adapter (str):
174
- The name of the adapter for which the delta weight should be computed.
175
- """
176
- device = self.lora_B[adapter].weight.device
177
- dtype = self.lora_B[adapter].weight.dtype
178
-
179
- # In case users wants to merge the adapter weights that are in
180
- # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
181
- # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
182
- cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)
183
-
184
- weight_A = self.lora_A[adapter].weight
185
- weight_B = self.lora_B[adapter].weight
186
-
187
- if cast_to_fp32:
188
- weight_A = weight_A.float()
189
- weight_B = weight_B.float()
190
-
191
- output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter]
192
-
193
- if cast_to_fp32:
194
- output_tensor = output_tensor.to(dtype=dtype)
195
-
196
- # cast back the weights
197
- self.lora_A[adapter].weight.data = weight_A.to(dtype)
198
- self.lora_B[adapter].weight.data = weight_B.to(dtype)
199
-
200
- return output_tensor
201
-
202
- def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
203
  self._check_forward_args(x, *args, **kwargs)
204
- adapter_names = kwargs.pop("adapter_names", None)
205
 
206
  if self.disable_adapters:
207
  if self.merged:
208
  self.unmerge()
209
  result = self.base_layer(x, *args, **kwargs)
210
- elif adapter_names is not None:
211
- result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
212
  elif self.merged:
213
  result = self.base_layer(x, *args, **kwargs)
214
  else:
@@ -219,30 +78,34 @@ class Linear(nn.Module, LoraLayer):
219
  for active_adapter in self.active_adapters:
220
  if active_adapter not in lora_A_keys:
221
  continue
222
-
223
- lora_A = self.lora_A[active_adapter]['default']
224
- lora_B = self.lora_B[active_adapter]['default']
225
- dropout = self.lora_dropout[active_adapter]
226
- scaling = self.scaling[active_adapter]
227
- x = self._cast_input_dtype(x, lora_A.weight.dtype)
228
-
229
- if not self.use_dora[active_adapter]:
230
  result = result + lora_B(lora_A(dropout(x))) * scaling
231
  else:
232
- if isinstance(dropout, nn.Identity) or not self.training:
233
- base_result = result
234
- else:
235
- x = dropout(x)
236
- base_result = None
237
-
238
- result = result + self.lora_magnitude_vector[active_adapter](
239
- x,
240
- lora_A=lora_A,
241
- lora_B=lora_B,
242
- scaling=scaling,
243
- base_layer=self.get_base_layer(),
244
- base_result=base_result,
245
- )
 
 
 
 
 
246
 
247
  result = result.to(torch_result_dtype)
248
 
@@ -278,12 +141,12 @@ class Linear(nn.Module, LoraLayer):
278
  self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
279
  # Actual trainable parameters
280
  self.lora_A[adapter_name] = nn.ModuleDict({
281
- "default": nn.Linear(self.in_features, r, bias=False),
282
- "second_adapter": nn.Linear(self.in_features, r, bias=False)
283
  })
284
  self.lora_B[adapter_name] = nn.ModuleDict({
285
- "default": nn.Linear(r, self.out_features, bias=lora_bias),
286
- "second_adapter": nn.Linear(r, self.out_features, bias=lora_bias)
287
  })
288
  self.lora_bias[adapter_name] = lora_bias
289
 
@@ -303,15 +166,28 @@ class Linear(nn.Module, LoraLayer):
303
  if init_lora_weights is True:
304
  # initialize A the same way as the default for nn.Linear and B to zero
305
  # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124
306
- nn.init.kaiming_uniform_(self.lora_A[adapter_name]['default'].weight, a=math.sqrt(5))
307
- nn.init.kaiming_uniform_(self.lora_A[adapter_name]['second_adapter'].weight, a=math.sqrt(5))
308
  elif init_lora_weights.lower() == "gaussian":
309
- nn.init.normal_(self.lora_A[adapter_name]['default'].weight, std=1 / self.r[adapter_name])
310
- nn.init.normal_(self.lora_A[adapter_name]['second_adapter'].weight, std=1 / self.r[adapter_name])
311
  else:
312
  raise ValueError(f"Unknown initialization {init_lora_weights=}")
313
- nn.init.zeros_(self.lora_B[adapter_name]['default'].weight)
314
- nn.init.zeros_(self.lora_B[adapter_name]['second_adapter'].weight)
315
  if self.lora_bias[adapter_name]:
316
- nn.init.zeros_(self.lora_B[adapter_name]['default'].bias)
317
- nn.init.zeros_(self.lora_B[adapter_name]['second_adapter'].bias)
 
 
2
 
3
  import math
4
  import warnings
5
+ from typing import Any, Optional, Union, List
6
 
7
  import torch
8
  import torch.nn as nn
 
 
 
 
9
 
 
 
 
 
 
 
 
 
10
  from peft.tuners.lora import LoraLayer
11
 
12
+ class MultiAdapterLinear(nn.Module, LoraLayer):
13
+ """
14
+ Custom LoRA module supporting multiple adapters for a linear layer.
15
+
16
+ This module extends the standard LoRA implementation to support multiple task-specific
17
+ adapters that can be dynamically selected during the forward pass. The task_label
18
+ parameter passed to the forward function determines which LoRA adapter(s) to use:
19
+ - If task_label is a string, all examples in the batch use the same adapter
20
+ - If task_label is a list of strings, each example can use a different adapter
21
+
22
+ This enables efficient multi-task inference where all task-specific LoRA adapters
23
+ are loaded in memory simultaneously and dynamically selected per example, eliminating
24
+ the need to switch adapter states between tasks and allowing optimal throughput
25
+ for mixed-task batches.
26
+
27
+ Derived from peft.tuners.lora.Linear.
28
+ """
29
  def __init__(
30
  self,
31
  base_layer,
32
  adapter_name: str,
33
+ task_names: List[str],
34
  r: int = 0,
35
  lora_alpha: int = 1,
36
  lora_dropout: float = 0.0,
 
44
  ) -> None:
45
  super().__init__()
46
  LoraLayer.__init__(self, base_layer, **kwargs)
 
47
 
48
+ self.fan_in_fan_out = fan_in_fan_out
49
+ self.task_names = task_names
50
  self._active_adapter = adapter_name
51
  self.update_layer(
52
  adapter_name,
 
60
  )
61
  self.is_target_conv_1d_layer = is_target_conv_1d_layer
62
 
 
63
 
64
+ def forward(self, x: torch.Tensor, task_label: Union[str, List[str]], *args: Any, **kwargs: Any) -> torch.Tensor:
 
 
65
  self._check_forward_args(x, *args, **kwargs)
 
66
 
67
  if self.disable_adapters:
68
  if self.merged:
69
  self.unmerge()
70
  result = self.base_layer(x, *args, **kwargs)
 
 
71
  elif self.merged:
72
  result = self.base_layer(x, *args, **kwargs)
73
  else:
 
78
  for active_adapter in self.active_adapters:
79
  if active_adapter not in lora_A_keys:
80
  continue
81
+
82
+ if isinstance(task_label, str):
83
+ lora_A = self.lora_A[active_adapter][task_label]
84
+ lora_B = self.lora_B[active_adapter][task_label]
85
+ dropout = self.lora_dropout[active_adapter]
86
+ scaling = self.scaling[active_adapter]
87
+ x = self._cast_input_dtype(x, lora_A.weight.dtype)
 
88
  result = result + lora_B(lora_A(dropout(x))) * scaling
89
  else:
90
+ unique_tasks = list(set(task_label))
91
+ lora_output = torch.zeros_like(result)
92
+
93
+ for task in unique_tasks:
94
+ task_indices = [i for i, t in enumerate(task_label) if t == task]
95
+ task_x = x[task_indices]
96
+
97
+ lora_A = self.lora_A[active_adapter][task]
98
+ lora_B = self.lora_B[active_adapter][task]
99
+ dropout = self.lora_dropout[active_adapter]
100
+ scaling = self.scaling[active_adapter]
101
+
102
+ task_x = self._cast_input_dtype(task_x, lora_A.weight.dtype)
103
+ task_lora_value = lora_B(lora_A(dropout(task_x))) * scaling
104
+
105
+ for i, idx in enumerate(task_indices):
106
+ lora_output[idx] = task_lora_value[i]
107
+
108
+ result = result + lora_output
109
 
110
  result = result.to(torch_result_dtype)
111
 
 
141
  self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
142
  # Actual trainable parameters
143
  self.lora_A[adapter_name] = nn.ModuleDict({
144
+ task_name: nn.Linear(self.in_features, r, bias=False)
145
+ for task_name in self.task_names
146
  })
147
  self.lora_B[adapter_name] = nn.ModuleDict({
148
+ task_name: nn.Linear(r, self.out_features, bias=lora_bias)
149
+ for task_name in self.task_names
150
  })
151
  self.lora_bias[adapter_name] = lora_bias
152
 
 
166
  if init_lora_weights is True:
167
  # initialize A the same way as the default for nn.Linear and B to zero
168
  # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124
169
+ for task_name in self.task_names:
170
+ nn.init.kaiming_uniform_(self.lora_A[adapter_name][task_name].weight, a=math.sqrt(5))
171
  elif init_lora_weights.lower() == "gaussian":
172
+ for task_name in self.task_names:
173
+ nn.init.normal_(self.lora_A[adapter_name][task_name].weight, std=1 / self.r[adapter_name])
174
  else:
175
  raise ValueError(f"Unknown initialization {init_lora_weights=}")
176
+ for task_name in self.task_names:
177
+ nn.init.zeros_(self.lora_B[adapter_name][task_name].weight)
178
  if self.lora_bias[adapter_name]:
179
+ for task_name in self.task_names:
180
+ nn.init.zeros_(self.lora_B[adapter_name][task_name].bias)
181
+
182
+
183
+ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
184
+ """
185
+ Merge the active adapter weights into the base weights
186
+ """
187
+ raise NotImplementedError("Merge operation is not supported")
188
+
189
+ def unmerge(self) -> None:
190
+ """
191
+ This method unmerges all merged adapter layers from the base weights.
192
+ """
193
+ raise NotImplementedError("Unmerge operation is not supported")
modeling_jina_embeddings_v4.py CHANGED
@@ -20,22 +20,15 @@ from transformers import BatchFeature
20
  from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
21
  from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
22
  import peft
23
- from .custom_lora_module import Linear
 
24
 
25
  class PromptType(str, Enum):
26
  query = "query"
27
  passage = "passage"
28
 
29
 
30
- class TaskType(str, Enum):
31
- retrieval = "retrieval"
32
- code = "code"
33
- text_matching = "text-matching"
34
- test = "test"
35
-
36
-
37
  PREFIX_DICT = {"query": "Query", "passage": "Passage"}
38
- TRUNCATE_DIMS = [128, 256, 512, 1024]
39
  VECTOR_TYPES = ["single_vector", "multi_vector"]
40
 
41
 
@@ -153,9 +146,28 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
153
  )
154
  self.single_vector_projector_dim = config.single_vector_projector_dim
155
  self.multi_vector_projector_dim = config.multi_vector_projector_dim
 
 
156
 
157
  def get_last_hidden_states(
158
  self,
 
159
  input_ids: torch.LongTensor,
160
  attention_mask: torch.Tensor,
161
  **kwargs,
@@ -174,8 +186,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
174
 
175
  kwargs["output_hidden_states"] = True
176
  outputs = super().forward(
177
- input_ids,
178
- attention_mask,
 
179
  **kwargs,
180
  position_ids=position_ids,
181
  rope_deltas=rope_deltas,
@@ -207,6 +220,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
207
 
208
  def project_to_single_vector_embeddings(
209
  self,
 
210
  hidden_states: torch.Tensor,
211
  attention_mask: torch.Tensor,
212
  input_ids: Optional[torch.LongTensor] = None,
@@ -215,33 +229,48 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
215
  Project the hidden states to single-vector embeddings.
216
  """
217
  if self._input_has_image(input_ids[0]): # got document image
218
- img_start_positions = torch.where(input_ids == self.config.vision_start_token_id)[1]
219
- img_end_positions = torch.where(input_ids == self.config.vision_end_token_id)[1]
220
-
 
 
 
 
221
  batch_size, seq_len = input_ids.shape
222
- position_indices = torch.arange(seq_len, device=input_ids.device).expand(batch_size, -1)
223
- image_mask = (position_indices >= img_start_positions.unsqueeze(1)) & (position_indices <= img_end_positions.unsqueeze(1))
224
-
 
 
 
 
225
  masked_hidden_states = hidden_states * image_mask.unsqueeze(-1)
226
- pooled_output = masked_hidden_states.sum(dim=1) / image_mask.sum(dim=1, keepdim=True)
 
 
227
 
228
  else: # got query text
229
  pooled_output = torch.sum(
230
  hidden_states * attention_mask.unsqueeze(-1), dim=1
231
  ) / torch.sum(attention_mask, dim=1, keepdim=True)
232
 
233
- single_vec_emb = self.single_vector_projector(pooled_output)
 
 
234
  return torch.nn.functional.normalize(single_vec_emb, dim=-1)
235
 
236
  def project_to_multi_vector_embeddings(
237
  self,
 
238
  hidden_states: torch.Tensor,
239
  attention_mask: torch.Tensor,
240
  ) -> torch.Tensor:
241
  """
242
  Project the hidden states to multi-vector embeddings.
243
  """
244
- multi_vec_emb = self.multi_vector_projector(hidden_states)
 
 
245
  multi_vec_emb = torch.nn.functional.normalize(multi_vec_emb, dim=-1)
246
  return multi_vec_emb * attention_mask.unsqueeze(-1)
247
 
@@ -250,6 +279,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
250
 
251
  def forward(
252
  self,
 
253
  input_ids: torch.LongTensor,
254
  attention_mask: torch.Tensor,
255
  output_vlm_last_hidden_states: bool = False,
@@ -267,14 +297,22 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
267
  """
268
  # Forward pass through the VLM
269
  hidden_states = self.get_last_hidden_states(
270
- input_ids=input_ids, attention_mask=attention_mask, **kwargs
 
 
 
271
  ) # (batch_size, seq_length, hidden_size)
272
  # Compute the embeddings
273
  single_vec_emb = self.project_to_single_vector_embeddings(
274
- hidden_states, attention_mask, input_ids=input_ids
 
 
 
275
  )
276
  multi_vec_emb = self.project_to_multi_vector_embeddings(
277
- hidden_states, attention_mask
 
 
278
  )
279
 
280
  return JinaEmbeddingsV4ModelOutput(
@@ -288,6 +326,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
288
  def _process_batches(
289
  self,
290
  data: List[Union[str, Image.Image]],
 
291
  processor_fn: Callable,
292
  desc: str,
293
  vector_type: str = "single_vector",
@@ -307,7 +346,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
307
  with torch.no_grad():
308
  batch = {k: v.to(self.device) for k, v in batch.items()}
309
  with torch.autocast(device_type=torch.device(self.device).type):
310
- embeddings = self(**batch)
311
  if vector_type == "single_vector":
312
  embeddings = embeddings.single_vec_emb
313
  if truncate_dim is not None:
@@ -338,7 +377,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
338
  else:
339
  encode_kwargs["prefix"] = (
340
  PREFIX_DICT[prompt_name]
341
- if self.task != TaskType.text_matching
342
  else PREFIX_DICT["query"]
343
  )
344
 
@@ -351,18 +390,32 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
351
  encode_kwargs["vector_type"] = vector_type
352
 
353
  truncate_dim = truncate_dim or self.config.truncate_dim
354
- if truncate_dim is not None and truncate_dim not in TRUNCATE_DIMS:
355
  raise ValueError(
356
- f"Invalid truncate_dim: {truncate_dim}. Must be one of {TRUNCATE_DIMS}."
357
  )
358
  else:
359
  encode_kwargs["truncate_dim"] = truncate_dim
360
 
361
  return encode_kwargs
 
 
362
 
363
  def encode_texts(
364
  self,
365
  texts: List[str],
 
366
  max_length: int = 8192,
367
  batch_size: int = 8,
368
  vector_type: Optional[str] = None,
@@ -390,6 +443,8 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
390
  vector_type, truncate_dim, prompt_name
391
  )
392
 
 
 
393
  processor_fn = partial(
394
  self.processor.process_texts,
395
  max_length=max_length,
@@ -400,6 +455,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
400
  data=texts,
401
  processor_fn=processor_fn,
402
  desc="Encoding texts...",
 
403
  return_numpy=return_numpy,
404
  batch_size=batch_size,
405
  **encode_kwargs,
@@ -410,6 +466,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
410
  def encode_images(
411
  self,
412
  images: List[Image.Image],
 
413
  batch_size: int = 8,
414
  vector_type: Optional[str] = None,
415
  return_numpy: bool = False,
@@ -432,14 +489,17 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
432
  """
433
  if max_pixels:
434
  default_max_pixels = self.processor.image_processor.max_pixels
435
- self.processor.image_processor.max_pixels = max_pixels # change during encoding
 
 
436
 
437
  encode_kwargs = self._validate_encoding_params(vector_type, truncate_dim)
438
-
439
  embeddings = self._process_batches(
440
  data=images,
441
  processor_fn=self.processor.process_images,
442
  desc="Encoding images...",
 
443
  batch_size=batch_size,
444
  return_numpy=return_numpy,
445
  **encode_kwargs,
@@ -463,15 +523,6 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
463
  if "torch_dtype" not in kwargs:
464
  kwargs["torch_dtype"] = "auto"
465
 
466
- task_value = kwargs.pop("task", "test")
467
- try:
468
- task = TaskType(task_value)
469
- except ValueError:
470
- valid_tasks = [t.value for t in TaskType]
471
- raise ValueError(
472
- f"Invalid task: {task_value}. Must be one of {valid_tasks}."
473
- )
474
-
475
  base_model = super().from_pretrained(
476
  pretrained_model_name_or_path, *args, **kwargs
477
  )
@@ -485,46 +536,31 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
485
  )
486
  adapter_dir = os.path.join(adapter_cache_path, "adapters")
487
 
488
- base_model.adapter_dir = adapter_dir
489
- base_model.task = task
490
-
491
- lora_config = LoraConfig.from_pretrained(os.path.join(adapter_dir, task.value))
492
- lora_config._custom_modules = {torch.nn.modules.linear.Linear: Linear}
493
- # Create the PEFT model with the requested task adapter
 
494
  peft_model = PeftModel.from_pretrained(
495
- model=base_model, model_id=os.path.join(adapter_dir, task.value), config=lora_config
 
 
496
  )
497
 
498
- # Add set_task method to the PEFT model instance
499
- def set_task_method(self, task: Union[str, TaskType]):
500
- """
501
- Set the task adapter for the model.
502
-
503
- Args:
504
- task (Union[str, TaskType]): The task name. Must be one of TaskType values or
505
- one of ['retrieval', 'text-matching', 'code']
506
- """
507
- if isinstance(task, str):
508
- try:
509
- task = TaskType(task)
510
- except ValueError:
511
- valid_tasks = [t.value for t in TaskType]
512
- raise ValueError(
513
- f"Invalid task: {task}. Must be one of {valid_tasks}"
514
- )
515
- if self.model.task != task:
516
- adapter_path = os.path.join(self.adapter_dir, task.value)
517
- hotswap_adapter(self, adapter_path, adapter_name="default")
518
- self.model.task = task
519
-
520
- def get_task_method(self):
521
- """
522
- Get the task adapter for the model.
523
- """
524
- return self.model.task.value
525
-
526
- # Bind the methods to the instance
527
- peft_model.set_task = set_task_method.__get__(peft_model, type(peft_model))
528
- peft_model.get_task = get_task_method.__get__(peft_model, type(peft_model))
529
 
530
  return peft_model
 
20
  from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
21
  from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
22
  import peft
23
+ from .custom_lora_module import MultiAdapterLinear
24
+
25
 
26
  class PromptType(str, Enum):
27
  query = "query"
28
  passage = "passage"
29
 
30
 
 
 
 
 
 
 
 
31
  PREFIX_DICT = {"query": "Query", "passage": "Passage"}
 
32
  VECTOR_TYPES = ["single_vector", "multi_vector"]
33
 
34
 
 
146
  )
147
  self.single_vector_projector_dim = config.single_vector_projector_dim
148
  self.multi_vector_projector_dim = config.multi_vector_projector_dim
149
+ self._task = None
150
+
151
+ @property
152
+ def task(self) -> Optional[str]:
153
+ """Get the current task set for the model."""
154
+ return self._task
155
+
156
+ @task.setter
157
+ def task(self, task: str):
158
+ """
159
+ Set the task for the model.
160
+
161
+ Args:
162
+ task (str): The task name. Must be one of ['retrieval', 'text-matching', 'code']
163
+ """
164
+ if task not in self.config.task_names:
165
+ raise ValueError(f"Invalid task: {task}. Must be one of {self.config.task_names}.")
166
+ self._task = task
167
 
168
  def get_last_hidden_states(
169
  self,
170
+ task_label: Union[str, List[str]],
171
  input_ids: torch.LongTensor,
172
  attention_mask: torch.Tensor,
173
  **kwargs,
 
186
 
187
  kwargs["output_hidden_states"] = True
188
  outputs = super().forward(
189
+ task_label=task_label,
190
+ input_ids=input_ids,
191
+ attention_mask=attention_mask,
192
  **kwargs,
193
  position_ids=position_ids,
194
  rope_deltas=rope_deltas,
 
220
 
221
  def project_to_single_vector_embeddings(
222
  self,
223
+ task_label: Union[str, List[str]],
224
  hidden_states: torch.Tensor,
225
  attention_mask: torch.Tensor,
226
  input_ids: Optional[torch.LongTensor] = None,
 
229
  Project the hidden states to single-vector embeddings.
230
  """
231
  if self._input_has_image(input_ids[0]): # got document image
232
+ img_start_positions = torch.where(
233
+ input_ids == self.config.vision_start_token_id
234
+ )[1]
235
+ img_end_positions = torch.where(
236
+ input_ids == self.config.vision_end_token_id
237
+ )[1]
238
+
239
  batch_size, seq_len = input_ids.shape
240
+ position_indices = torch.arange(seq_len, device=input_ids.device).expand(
241
+ batch_size, -1
242
+ )
243
+ image_mask = (position_indices >= img_start_positions.unsqueeze(1)) & (
244
+ position_indices <= img_end_positions.unsqueeze(1)
245
+ )
246
+
247
  masked_hidden_states = hidden_states * image_mask.unsqueeze(-1)
248
+ pooled_output = masked_hidden_states.sum(dim=1) / image_mask.sum(
249
+ dim=1, keepdim=True
250
+ )
251
 
252
  else: # got query text
253
  pooled_output = torch.sum(
254
  hidden_states * attention_mask.unsqueeze(-1), dim=1
255
  ) / torch.sum(attention_mask, dim=1, keepdim=True)
256
 
257
+ single_vec_emb = self.single_vector_projector(
258
+ pooled_output, task_label=task_label
259
+ )
260
  return torch.nn.functional.normalize(single_vec_emb, dim=-1)
261
 
262
  def project_to_multi_vector_embeddings(
263
  self,
264
+ task_label: Union[str, List[str]],
265
  hidden_states: torch.Tensor,
266
  attention_mask: torch.Tensor,
267
  ) -> torch.Tensor:
268
  """
269
  Project the hidden states to multi-vector embeddings.
270
  """
271
+ multi_vec_emb = self.multi_vector_projector(
272
+ hidden_states, task_label=task_label
273
+ )
274
  multi_vec_emb = torch.nn.functional.normalize(multi_vec_emb, dim=-1)
275
  return multi_vec_emb * attention_mask.unsqueeze(-1)
276
 
 
279
 
280
  def forward(
281
  self,
282
+ task_label: Union[str, List[str]],
283
  input_ids: torch.LongTensor,
284
  attention_mask: torch.Tensor,
285
  output_vlm_last_hidden_states: bool = False,
 
297
  """
298
  # Forward pass through the VLM
299
  hidden_states = self.get_last_hidden_states(
300
+ input_ids=input_ids,
301
+ attention_mask=attention_mask,
302
+ task_label=task_label,
303
+ **kwargs,
304
  ) # (batch_size, seq_length, hidden_size)
305
  # Compute the embeddings
306
  single_vec_emb = self.project_to_single_vector_embeddings(
307
+ hidden_states=hidden_states,
308
+ attention_mask=attention_mask,
309
+ input_ids=input_ids,
310
+ task_label=task_label,
311
  )
312
  multi_vec_emb = self.project_to_multi_vector_embeddings(
313
+ hidden_states=hidden_states,
314
+ attention_mask=attention_mask,
315
+ task_label=task_label,
316
  )
317
 
318
  return JinaEmbeddingsV4ModelOutput(
 
326
  def _process_batches(
327
  self,
328
  data: List[Union[str, Image.Image]],
329
+ task_label: Union[str, List[str]],
330
  processor_fn: Callable,
331
  desc: str,
332
  vector_type: str = "single_vector",
 
346
  with torch.no_grad():
347
  batch = {k: v.to(self.device) for k, v in batch.items()}
348
  with torch.autocast(device_type=torch.device(self.device).type):
349
+ embeddings = self(**batch, task_label=task_label)
350
  if vector_type == "single_vector":
351
  embeddings = embeddings.single_vec_emb
352
  if truncate_dim is not None:
 
377
  else:
378
  encode_kwargs["prefix"] = (
379
  PREFIX_DICT[prompt_name]
380
+ if self.task != "text-matching"
381
  else PREFIX_DICT["query"]
382
  )
383
 
 
390
  encode_kwargs["vector_type"] = vector_type
391
 
392
  truncate_dim = truncate_dim or self.config.truncate_dim
393
+ if truncate_dim is not None and truncate_dim not in self.config.matryoshka_dims:
394
  raise ValueError(
395
+ f"Invalid truncate_dim: {truncate_dim}. Must be one of {self.config.matryoshka_dims}."
396
  )
397
  else:
398
  encode_kwargs["truncate_dim"] = truncate_dim
399
 
400
  return encode_kwargs
401
+
402
+ def _validate_task(self, task: Optional[str] = None) -> str:
403
+ if task is None:
404
+ if self.task is None:
405
+ raise ValueError(
406
+ "Task must be specified before encoding data. You can set it either as a model property "
407
+ "(e.g., model.task = 'retrieval') or pass it as an argument to the encode method."
408
+ )
409
+ task = self.task
410
+ else:
411
+ if task not in self.config.task_names:
412
+ raise ValueError(f"Invalid task: {task}. Must be one of {self.config.task_names}.")
413
+ return task
414
 
415
  def encode_texts(
416
  self,
417
  texts: List[str],
418
+ task: Optional[str] = None,
419
  max_length: int = 8192,
420
  batch_size: int = 8,
421
  vector_type: Optional[str] = None,
 
443
  vector_type, truncate_dim, prompt_name
444
  )
445
 
446
+ task = self._validate_task(task)
447
+
448
  processor_fn = partial(
449
  self.processor.process_texts,
450
  max_length=max_length,
 
455
  data=texts,
456
  processor_fn=processor_fn,
457
  desc="Encoding texts...",
458
+ task_label=task,
459
  return_numpy=return_numpy,
460
  batch_size=batch_size,
461
  **encode_kwargs,
 
466
  def encode_images(
467
  self,
468
  images: List[Image.Image],
469
+ task: Optional[str] = None,
470
  batch_size: int = 8,
471
  vector_type: Optional[str] = None,
472
  return_numpy: bool = False,
 
489
  """
490
  if max_pixels:
491
  default_max_pixels = self.processor.image_processor.max_pixels
492
+ self.processor.image_processor.max_pixels = (
493
+ max_pixels # change during encoding
494
+ )
495
 
496
  encode_kwargs = self._validate_encoding_params(vector_type, truncate_dim)
497
+ task = self._validate_task(task)
498
  embeddings = self._process_batches(
499
  data=images,
500
  processor_fn=self.processor.process_images,
501
  desc="Encoding images...",
502
+ task_label=task,
503
  batch_size=batch_size,
504
  return_numpy=return_numpy,
505
  **encode_kwargs,
 
523
  if "torch_dtype" not in kwargs:
524
  kwargs["torch_dtype"] = "auto"
525
 
 
 
 
 
 
 
 
 
 
526
  base_model = super().from_pretrained(
527
  pretrained_model_name_or_path, *args, **kwargs
528
  )
 
536
  )
537
  adapter_dir = os.path.join(adapter_cache_path, "adapters")
538
 
539
+ lora_config = LoraConfig.from_pretrained(os.path.join(adapter_dir, "test"))
540
+ lora_config._custom_modules = {
541
+ torch.nn.modules.linear.Linear: partial(
542
+ MultiAdapterLinear,
543
+ task_names=base_model.config.task_names,
544
+ )
545
+ }
546
  peft_model = PeftModel.from_pretrained(
547
+ model=base_model,
548
+ model_id=os.path.join(adapter_dir, "test"),
549
+ config=lora_config,
550
  )
551
 
552
+ @property
553
+ def task(self):
554
+ return self.model.task
555
+
556
+ @task.setter
557
+ def task(self, value):
558
+ self.model.task = value
559
+
560
+ peft_model.task = property(task.fget, task.fset)
561
+ peft_model.__class__.task = property(
562
+ lambda self: self.model.task,
563
+ lambda self, value: setattr(self.model, 'task', value)
564
+ )
 
 
565
 
566
  return peft_model
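Taken together, the modeling changes replace the old load-time task argument and the set_task/hotswap path with a config-validated task property plus an optional per-call task argument on the encode methods. A hedged usage sketch follows; the checkpoint id and the AutoModel/trust_remote_code routing are assumptions, while the property and method names match the code above.

from transformers import AutoModel

# Assumed checkpoint id; any repo shipping these custom modeling files behaves the same way.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)

# Option 1: set the task once as a property (validated against config.task_names).
model.task = "retrieval"
query_embs = model.encode_texts(["how does matryoshka truncation work?"], truncate_dim=256)

# Option 2: pass the task per call; unknown task names raise a ValueError.
code_embs = model.encode_texts(["def add(a, b): return a + b"], task="code")

# One level down, forward() accepts task_label either as one string or as a list with one
# entry per example, which is what allows a single batch to mix tasks.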
qwen2_5_vl.py CHANGED
@@ -1,28 +1,6 @@
1
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
- # This file was automatically generated from src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py.
3
- # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
- # the file from the modular. If any change should be done, please apply the change to the
5
- # modular_qwen2_5_vl.py file directly. One of our CI enforces this.
6
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
- # coding=utf-8
8
- # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
9
- #
10
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
11
- # and OPT implementations in this library. It has been modified from its
12
- # original forms to accommodate minor architectural differences compared
13
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
14
- #
15
- # Licensed under the Apache License, Version 2.0 (the "License");
16
- # you may not use this file except in compliance with the License.
17
- # You may obtain a copy of the License at
18
- #
19
- # http://www.apache.org/licenses/LICENSE-2.0
20
- #
21
- # Unless required by applicable law or agreed to in writing, software
22
- # distributed under the License is distributed on an "AS IS" BASIS,
23
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24
- # See the License for the specific language governing permissions and
25
- # limitations under the License.
26
  from transformers.configuration_utils import PretrainedConfig
27
  from transformers.modeling_rope_utils import rope_config_validation
28
 
@@ -256,32 +234,6 @@ class Qwen2_5_VLConfig(PretrainedConfig):
256
 
257
 
258
 
259
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
260
- # This file was automatically generated from src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py.
261
- # Do NOT edit this file manually as any edits will be overwritten by the generation of
262
- # the file from the modular. If any change should be done, please apply the change to the
263
- # modular_qwen2_5_vl.py file directly. One of our CI enforces this.
264
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
265
- # coding=utf-8
266
- # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
267
- #
268
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
269
- # and OPT implementations in this library. It has been modified from its
270
- # original forms to accommodate minor architectural differences compared
271
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
272
- #
273
- # Licensed under the Apache License, Version 2.0 (the "License");
274
- # you may not use this file except in compliance with the License.
275
- # You may obtain a copy of the License at
276
- #
277
- # http://www.apache.org/licenses/LICENSE-2.0
278
- #
279
- # Unless required by applicable law or agreed to in writing, software
280
- # distributed under the License is distributed on an "AS IS" BASIS,
281
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
282
- # See the License for the specific language governing permissions and
283
- # limitations under the License.
284
-
285
  import math
286
  from dataclasses import dataclass
287
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -891,8 +843,8 @@ class Qwen2MLP(nn.Module):
891
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
892
  self.act_fn = ACT2FN[config.hidden_act]
893
 
894
- def forward(self, x):
895
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
896
  return down_proj
897
 
898
 
@@ -1179,6 +1131,7 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1179
  # Adapted from Qwen2Attention.forward
1180
  def forward(
1181
  self,
 
1182
  hidden_states: torch.Tensor,
1183
  attention_mask: Optional[torch.Tensor] = None,
1184
  position_ids: Optional[torch.LongTensor] = None,
@@ -1207,9 +1160,9 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1207
 
1208
  bsz, q_len, _ = hidden_states.size()
1209
 
1210
- query_states = self.q_proj(hidden_states)
1211
- key_states = self.k_proj(hidden_states)
1212
- value_states = self.v_proj(hidden_states)
1213
 
1214
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1215
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1255,7 +1208,7 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1255
  attn_output = attn_output.transpose(1, 2).contiguous()
1256
  attn_output = attn_output.view(bsz, q_len, self.hidden_size)
1257
 
1258
- attn_output = self.o_proj(attn_output)
1259
 
1260
  return attn_output, None, past_key_value
1261
 
@@ -1285,6 +1238,7 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
1285
 
1286
  def forward(
1287
  self,
 
1288
  hidden_states: torch.Tensor,
1289
  attention_mask: Optional[torch.Tensor] = None,
1290
  position_ids: Optional[torch.LongTensor] = None,
@@ -1323,6 +1277,7 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
1323
 
1324
  # Self Attention
1325
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
 
1326
  hidden_states=hidden_states,
1327
  attention_mask=attention_mask,
1328
  position_ids=position_ids,
@@ -1337,7 +1292,7 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
1337
  # Fully Connected
1338
  residual = hidden_states
1339
  hidden_states = self.post_attention_layernorm(hidden_states)
1340
- hidden_states = self.mlp(hidden_states)
1341
  hidden_states = residual + hidden_states
1342
 
1343
  outputs = (hidden_states,)
@@ -1381,6 +1336,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1381
 
1382
  def forward(
1383
  self,
 
1384
  input_ids: torch.LongTensor = None,
1385
  attention_mask: Optional[torch.Tensor] = None,
1386
  position_ids: Optional[torch.LongTensor] = None,
@@ -1461,7 +1417,8 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1461
  )
1462
  else:
1463
  layer_outputs = decoder_layer(
1464
- hidden_states,
 
1465
  attention_mask=causal_mask,
1466
  position_ids=position_ids,
1467
  past_key_value=past_key_values,
@@ -1979,6 +1936,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1979
  @replace_return_docstrings(output_type=Qwen2_5_VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1980
  def forward(
1981
  self,
 
1982
  input_ids: torch.LongTensor = None,
1983
  attention_mask: Optional[torch.Tensor] = None,
1984
  position_ids: Optional[torch.LongTensor] = None,
@@ -2115,6 +2073,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2115
  position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
2116
 
2117
  outputs = self.model(
 
2118
  input_ids=None,
2119
  position_ids=position_ids,
2120
  attention_mask=attention_mask,
@@ -2324,32 +2283,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2324
  return input_ids, model_kwargs
2325
 
2326
 
2327
-
2328
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2329
- # This file was automatically generated from src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py.
2330
- # Do NOT edit this file manually as any edits will be overwritten by the generation of
2331
- # the file from the modular. If any change should be done, please apply the change to the
2332
- # modular_qwen2_5_vl.py file directly. One of our CI enforces this.
2333
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2334
- # coding=utf-8
2335
- # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
2336
- #
2337
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
2338
- # and OPT implementations in this library. It has been modified from its
2339
- # original forms to accommodate minor architectural differences compared
2340
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
2341
- #
2342
- # Licensed under the Apache License, Version 2.0 (the "License");
2343
- # you may not use this file except in compliance with the License.
2344
- # You may obtain a copy of the License at
2345
- #
2346
- # http://www.apache.org/licenses/LICENSE-2.0
2347
- #
2348
- # Unless required by applicable law or agreed to in writing, software
2349
- # distributed under the License is distributed on an "AS IS" BASIS,
2350
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2351
- # See the License for the specific language governing permissions and
2352
- # limitations under the License.
2353
  from typing import List, Union
2354
 
2355
  from transformers.feature_extraction_utils import BatchFeature
 
1
+ # This file is a modified version of the Qwen2_5_VL model from the transformers library
2
+ # that implements task-specific LoRA layers for multi-task embeddings.
3
+
 
 
 
4
  from transformers.configuration_utils import PretrainedConfig
5
  from transformers.modeling_rope_utils import rope_config_validation
6
 
 
234
 
235
 
236
 
 
 
 
237
  import math
238
  from dataclasses import dataclass
239
  from typing import Any, Dict, List, Optional, Tuple, Union
 
843
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
844
  self.act_fn = ACT2FN[config.hidden_act]
845
 
846
+ def forward(self, x, task_label: Union[str, List[str]]):
847
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x, task_label=task_label)) * self.up_proj(x, task_label=task_label), task_label=task_label)
848
  return down_proj
849
 
850
 
 
1131
  # Adapted from Qwen2Attention.forward
1132
  def forward(
1133
  self,
1134
+ task_label: Union[str, List[str]],
1135
  hidden_states: torch.Tensor,
1136
  attention_mask: Optional[torch.Tensor] = None,
1137
  position_ids: Optional[torch.LongTensor] = None,
 
1160
 
1161
  bsz, q_len, _ = hidden_states.size()
1162
 
1163
+ query_states = self.q_proj(hidden_states, task_label=task_label)
1164
+ key_states = self.k_proj(hidden_states, task_label=task_label)
1165
+ value_states = self.v_proj(hidden_states, task_label=task_label)
1166
 
1167
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1168
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
1208
  attn_output = attn_output.transpose(1, 2).contiguous()
1209
  attn_output = attn_output.view(bsz, q_len, self.hidden_size)
1210
 
1211
+ attn_output = self.o_proj(attn_output, task_label=task_label)
1212
 
1213
  return attn_output, None, past_key_value
1214
 
 
1238
 
1239
  def forward(
1240
  self,
1241
+ task_label: Union[str, List[str]],
1242
  hidden_states: torch.Tensor,
1243
  attention_mask: Optional[torch.Tensor] = None,
1244
  position_ids: Optional[torch.LongTensor] = None,
 
1277
 
1278
  # Self Attention
1279
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
1280
+ task_label=task_label,
1281
  hidden_states=hidden_states,
1282
  attention_mask=attention_mask,
1283
  position_ids=position_ids,
 
1292
  # Fully Connected
1293
  residual = hidden_states
1294
  hidden_states = self.post_attention_layernorm(hidden_states)
1295
+ hidden_states = self.mlp(hidden_states, task_label=task_label)
1296
  hidden_states = residual + hidden_states
1297
 
1298
  outputs = (hidden_states,)
 
1336
 
1337
  def forward(
1338
  self,
1339
+ task_label: Union[str, List[str]],
1340
  input_ids: torch.LongTensor = None,
1341
  attention_mask: Optional[torch.Tensor] = None,
1342
  position_ids: Optional[torch.LongTensor] = None,
 
1417
  )
1418
  else:
1419
  layer_outputs = decoder_layer(
1420
+ task_label=task_label,
1421
+ hidden_states=hidden_states,
1422
  attention_mask=causal_mask,
1423
  position_ids=position_ids,
1424
  past_key_value=past_key_values,
 
1936
  @replace_return_docstrings(output_type=Qwen2_5_VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1937
  def forward(
1938
  self,
1939
+ task_label: Union[str, List[str]],
1940
  input_ids: torch.LongTensor = None,
1941
  attention_mask: Optional[torch.Tensor] = None,
1942
  position_ids: Optional[torch.LongTensor] = None,
 
2073
  position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
2074
 
2075
  outputs = self.model(
2076
+ task_label=task_label,
2077
  input_ids=None,
2078
  position_ids=position_ids,
2079
  attention_mask=attention_mask,
 
2283
  return input_ids, model_kwargs
2284
 
2285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2286
  from typing import List, Union
2287
 
2288
  from transformers.feature_extraction_utils import BatchFeature
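The pattern in this file is mechanical: every forward signature between the top-level model and the projection layers gains a task_label parameter, which is passed unchanged down to the q/k/v/o and MLP projections that MultiAdapterLinear wraps, and nothing else about the computation changes. A minimal sketch of the same threading with stand-in classes (not the real Qwen2.5-VL modules):

from typing import List, Union
import torch
import torch.nn as nn

class TaskAwareLinear(nn.Linear):
    # Stand-in for MultiAdapterLinear: accepts (and here simply ignores) task_label.
    def forward(self, x, task_label: Union[str, List[str]]):
        return super().forward(x)

class TaskAwareMLP(nn.Module):
    # Same shape as the patched Qwen2MLP.forward: task_label goes to every projection.
    def __init__(self, hidden=32, inter=64):
        super().__init__()
        self.gate_proj = TaskAwareLinear(hidden, inter, bias=False)
        self.up_proj = TaskAwareLinear(hidden, inter, bias=False)
        self.down_proj = TaskAwareLinear(inter, hidden, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x, task_label: Union[str, List[str]]):
        return self.down_proj(
            self.act_fn(self.gate_proj(x, task_label=task_label))
            * self.up_proj(x, task_label=task_label),
            task_label=task_label,
        )

mlp = TaskAwareMLP()
y = mlp(torch.randn(2, 5, 32), task_label="retrieval")  # threads through unchanged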