jupyterjazz committed
Commit 9624180 · 1 Parent(s): 4453d02

refactor: support urls, fast processor, flash attn check

modeling_jina_embeddings_v4.py CHANGED
@@ -5,20 +5,24 @@ import os
 from dataclasses import dataclass
 from enum import Enum
 from functools import partial
+from io import BytesIO
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Union, cast
 
 import numpy as np
+import requests
 import torch
 from huggingface_hub import snapshot_download
-from peft import PeftModel, LoraConfig
+from peft import LoraConfig, PeftModel
 from PIL import Image
 from torch import nn
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import BatchFeature
-from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
+from transformers.utils import is_flash_attn_2_available
+
 from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
 from .custom_lora_module import MultiAdapterLinear
+from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
 
 
 class PromptType(str, Enum):
@@ -140,7 +144,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         self._init_projection_layers(config)
         self.post_init()
         self.processor = JinaEmbeddingsV4Processor.from_pretrained(
-            self.name_or_path, trust_remote_code=True
+            self.name_or_path, trust_remote_code=True, use_fast=True
         )
         self.single_vector_projector_dim = config.single_vector_projector_dim
         self.multi_vector_projector_dim = config.multi_vector_projector_dim
@@ -160,7 +164,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
            task (str): The task name. Must be one of ['retrieval', 'text-matching', 'code']
        """
        if task not in self.config.task_names:
-            raise ValueError(f"Invalid task: {task}. Must be one of {self.config.task_names}.")
+            raise ValueError(
+                f"Invalid task: {task}. Must be one of {self.config.task_names}."
+            )
        self._task = task
 
    def get_last_hidden_states(
@@ -342,7 +348,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
        for batch in tqdm(dataloader, desc=desc):
            with torch.no_grad():
                batch = {k: v.to(self.device) for k, v in batch.items()}
-                with torch.autocast(device_type=torch.device(self.device).type, dtype=torch.bfloat16):
+                with torch.autocast(
+                    device_type=torch.device(self.device).type, dtype=torch.bfloat16
+                ):
                    embeddings = self(**batch, task_label=task_label)
                    if vector_type == "single_vector":
                        embeddings = embeddings.single_vec_emb
@@ -395,7 +403,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
            encode_kwargs["truncate_dim"] = truncate_dim
 
        return encode_kwargs
-
+
    def _validate_task(self, task: Optional[str] = None) -> str:
        if task is None:
            if self.task is None:
@@ -406,7 +414,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
                task = self.task
        else:
            if task not in self.config.task_names:
-                raise ValueError(f"Invalid task: {task}. Must be one of {self.config.task_names}.")
+                raise ValueError(
+                    f"Invalid task: {task}. Must be one of {self.config.task_names}."
+                )
        return task
 
    def encode_texts(
@@ -460,9 +470,23 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
 
        return embeddings
 
+    def _load_images_if_needed(
+        self, images: List[Union[str, Image.Image]]
+    ) -> List[Image.Image]:
+        loaded_images = []
+        for image in images:
+            if isinstance(image, str):
+                if image.startswith("http"):
+                    response = requests.get(image)
+                    image = Image.open(BytesIO(response.content)).convert("RGB")
+                else:
+                    image = Image.open(image).convert("RGB")
+            loaded_images.append(image)
+        return loaded_images
+
    def encode_images(
        self,
-        images: List[Image.Image],
+        images: List[Union[str, Image.Image]],
        task: Optional[str] = None,
        batch_size: int = 8,
        vector_type: Optional[str] = None,
@@ -474,7 +498,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
        Encodes a list of images into embeddings.
 
        Args:
-            images: List of PIL images to encode
+            images: List of PIL images, URLs, or local file paths to encode
            batch_size: Number of images to process at once
            vector_type: Type of embedding vector to generate ('single_vector' or 'multi_vector')
            return_numpy: Whether to return numpy arrays instead of torch tensors
@@ -489,9 +513,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
        self.processor.image_processor.max_pixels = (
            max_pixels  # change during encoding
        )
-
        encode_kwargs = self._validate_encoding_params(vector_type, truncate_dim)
        task = self._validate_task(task)
+        images = self._load_images_if_needed(images)
        embeddings = self._process_batches(
            data=images,
            processor_fn=self.processor.process_images,
@@ -519,8 +543,10 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
        """
        if "torch_dtype" not in kwargs:
            kwargs["torch_dtype"] = "auto"
-
+
        kwargs["key_mapping"] = super()._checkpoint_conversion_mapping
+        if not is_flash_attn_2_available():
+            kwargs["attn_implementation"] = "sdpa"
 
        base_model = super().from_pretrained(
            pretrained_model_name_or_path, *args, **kwargs
@@ -547,19 +573,19 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
            model_id=adapter_dir,
            config=lora_config,
        )
-
+
        @property
        def task(self):
            return self.model.task
-
+
        @task.setter
        def task(self, value):
            self.model.task = value
-
+
        peft_model.task = property(task.fget, task.fset)
        peft_model.__class__.task = property(
            lambda self: self.model.task,
-            lambda self, value: setattr(self.model, 'task', value)
+            lambda self, value: setattr(self.model, "task", value),
        )
 
        return peft_model
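With this change, encode_images accepts image URLs and local file paths in addition to PIL images; _load_images_if_needed downloads or opens them before batching. A minimal usage sketch (the repo id, URL, and file path below are illustrative assumptions, not part of this commit):

from transformers import AutoModel

# Assumed checkpoint id; any repo shipping this modeling file should behave the same.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)

embeddings = model.encode_images(
    images=[
        "https://example.com/chart.png",  # URL: fetched with requests, decoded via PIL
        "/data/figures/chart_local.png",  # local path: opened with PIL
    ],
    task="retrieval",  # must be one of config.task_names
)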
 
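The is_flash_attn_2_available() guard makes loading degrade gracefully: when flash-attn 2 is not installed, from_pretrained now forces the SDPA attention implementation instead of leaving whatever the config requests. A short sketch of the loading path, assuming the same repo id as above:

from transformers import AutoModel
from transformers.utils import is_flash_attn_2_available

# On hosts without flash-attn 2 this now falls back to attn_implementation="sdpa".
print("flash-attn 2 available:", is_flash_attn_2_available())
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)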
tokenizer_config.json CHANGED
@@ -202,7 +202,7 @@
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "processor_class": "ColQwen25DuoProcessor",
+  "processor_class": "JinaEmbeddingsV4Processor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null