jina-embeddings-v4 / processing_colqwen_duo.py

upload model

d1e5d70 4 months ago

9.25 kB

	import torch
	import math

	from abc import ABC, abstractmethod
	from PIL import Image

	from typing import Dict, List, Optional, Union, cast
	from typing_extensions import Unpack

	from transformers import BatchEncoding, BatchFeature
	from transformers.processing_utils import (
	AllKwargsForChatTemplate,
	ImageInput,
	PreTokenizedInput,
	TextInput,
	VideoInput,
	)
	from transformers.models.qwen2_vl import Qwen2VLProcessor


	def get_torch_device(device: str = "auto") -> str:
	    """
	    Resolve the name of the PyTorch device to use.

	    Any value other than "auto" is handed back untouched. For "auto" the
	    first available backend wins, probed in this order:
	    - "cuda:0"
	    - "mps" (Apple Silicon)
	    - "cpu"
	    """
	    # An explicit request is never second-guessed.
	    if device != "auto":
	        return device

	    if torch.cuda.is_available():
	        return "cuda:0"

	    if torch.backends.mps.is_available():  # for Apple Silicon
	        return "mps"

	    return "cpu"

	class BaseVisualRetrieverProcessor(ABC):
	    """
	    Base class for visual retriever processors.

	    Subclasses must implement `process_images`, `process_texts` and `score`.
	    The two static helpers implement the actual scoring for single-vector
	    (dot product) and multi-vector (MaxSim) embeddings.
	    """

	    @abstractmethod
	    def process_images(
	        self,
	        images: List[Image.Image],
	    ) -> Union[BatchFeature, BatchEncoding]:
	        """Convert a list of images into a batch of model inputs."""
	        pass

	    @abstractmethod
	    def process_texts(
	        self,
	        texts: List[str],
	        max_length: int = 50,
	        suffix: Optional[str] = None,
	        prefix: Optional[str] = None,
	    ) -> Union[BatchFeature, BatchEncoding]:
	        """Convert a list of texts into a batch of model inputs."""
	        pass

	    @abstractmethod
	    def score(
	        self,
	        qs: List[torch.Tensor],
	        ps: List[torch.Tensor],
	        device: Optional[Union[str, torch.device]] = None,
	        **kwargs,
	    ) -> torch.Tensor:
	        """Score query embeddings `qs` against passage embeddings `ps`."""
	        pass

	    @staticmethod
	    def score_single_vector(
	        qs: List[torch.Tensor],
	        ps: List[torch.Tensor],
	        device: Optional[Union[str, torch.device]] = None,
	    ) -> torch.Tensor:
	        """
	        Compute the dot product score for the given single-vector query and passage embeddings.

	        Args:
	            qs: Query embeddings, one 1-D tensor per query. They are stacked,
	                so all of them must share the same size.
	            ps: Passage embeddings, one 1-D tensor per passage, of the same
	                size as the queries.
	            device: Device on which the product is computed. Falls back to
	                `get_torch_device("auto")` when omitted.

	        Returns:
	            A float32 tensor of shape `(len(qs), len(ps))`. The result is left
	            on `device` (it is not moved back to the CPU).

	        Raises:
	            ValueError: If `qs` or `ps` is empty.
	        """
	        device = device or get_torch_device("auto")

	        if len(qs) == 0:
	            raise ValueError("No queries provided")
	        if len(ps) == 0:
	            raise ValueError("No passages provided")

	        qs_stacked = torch.stack(qs).to(device)
	        ps_stacked = torch.stack(ps).to(device)

	        # b = query index, c = passage index, d = embedding dimension.
	        scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
	        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

	        scores = scores.to(torch.float32)
	        return scores

	    @staticmethod
	    def score_multi_vector(
	        qs: List[torch.Tensor],
	        ps: List[torch.Tensor],
	        batch_size: int = 128,
	        device: Optional[Union[str, torch.device]] = None,
	    ) -> torch.Tensor:
	        """
	        Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.

	        For every (query, passage) pair, each query vector is matched with its
	        highest-scoring passage vector and these maxima are summed.

	        Args:
	            qs: Query embeddings, one 2-D tensor `(num_vectors, dim)` per query.
	                The number of vectors may differ between queries.
	            ps: Passage embeddings, one 2-D tensor `(num_vectors, dim)` per
	                passage, with the same `dim` as the queries.
	            batch_size: Number of queries and of passages processed per
	                block. Must be positive.
	            device: Device on which each block is computed. Falls back to
	                `get_torch_device("auto")` when omitted.

	        Returns:
	            A float32 CPU tensor of shape `(len(qs), len(ps))`.

	        Raises:
	            ValueError: If `qs` or `ps` is empty, or `batch_size` is not positive.
	        """
	        device = device or get_torch_device("auto")

	        if len(qs) == 0:
	            raise ValueError("No queries provided")
	        if len(ps) == 0:
	            raise ValueError("No passages provided")
	        # A zero step makes range() fail with an unrelated message and a
	        # negative step yields no blocks at all, so reject both up front.
	        if batch_size <= 0:
	            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

	        scores_list: List[torch.Tensor] = []

	        for i in range(0, len(qs), batch_size):
	            scores_batch = []
	            # Sequences inside a block are zero-padded to the longest one.
	            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
	                device
	            )
	            for j in range(0, len(ps), batch_size):
	                ps_batch = torch.nn.utils.rnn.pad_sequence(
	                    ps[j : j + batch_size], batch_first=True, padding_value=0
	                ).to(device)
	                # b/c = query/passage index, n/s = query/passage vector index.
	                # max over s, then sum over n, gives one score per (b, c).
	                # NOTE: padded passage rows contribute a similarity of 0 to
	                # the max; padded query rows add max(0, ...) = 0 to the sum.
	                scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
	            # Each block is moved to the CPU before the next one is computed.
	            scores_batch = torch.cat(scores_batch, dim=1).cpu()
	            scores_list.append(scores_batch)

	        scores = torch.cat(scores_list, dim=0)
	        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

	        scores = scores.to(torch.float32)
	        return scores



	class QwenVLProcessor(ABC):
	    """
	    Typed pass-through layer over the next processor class in the MRO.

	    Both methods hand their arguments, unchanged, to `super()`. The class
	    defines no processing of its own, so it has to be combined with a class
	    that provides `__call__` and `apply_chat_template`.
	    """

	    def __call__(
	        self,
	        images: Optional[ImageInput] = None,
	        text: Optional[Union[TextInput, PreTokenizedInput, List[PreTokenizedInput]]] = None,
	        videos: Optional[VideoInput] = None,
	        **kwargs,
	    ) -> BatchFeature:
	        """Forward `images`, `text`, `videos` and any extra keyword arguments to the parent `__call__`."""
	        parent_call = super().__call__  # type: ignore
	        return parent_call(images=images, text=text, videos=videos, **kwargs)

	    def apply_chat_template(
	        self,
	        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
	        chat_template: Optional[str] = None,
	        **kwargs: Unpack[AllKwargsForChatTemplate],
	    ) -> str:
	        """Forward `conversation`, `chat_template` and any extra keyword arguments to the parent implementation."""
	        parent_template = super().apply_chat_template  # type: ignore
	        return parent_template(conversation=conversation, chat_template=chat_template, **kwargs)


	class QwenVLEmbeddingProcessorBase(BaseVisualRetrieverProcessor, QwenVLProcessor):
	    """
	    Image and text preprocessing shared by the Qwen-VL based embedding processors.
	    """

	    # Length of the prefix created by
	    # super().apply_chat_template(conversation=conversation, chat_template=chat_template, **kwargs).
	    # `process_images` drops this many leading characters from the rendered template.
	    assistant_prefix_len: int = 58

	    @staticmethod
	    def round_by_factor(number: float, factor: int) -> int:
	        """Returns the closest integer to 'number' that is divisible by 'factor'."""
	        return round(number / factor) * factor

	    @staticmethod
	    def ceil_by_factor(number: float, factor: int) -> int:
	        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
	        return math.ceil(number / factor) * factor

	    @staticmethod
	    def floor_by_factor(number: float, factor: int) -> int:
	        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
	        return math.floor(number / factor) * factor

	    def process_images(
	        self,
	        images: Union[List[Image.Image], List[List[Image.Image]]],
	    ) -> BatchFeature:
	        """
	        Build the model inputs for a batch of images.

	        Args:
	            images: Either a flat list of images (one per document) or a list
	                of image lists (several images per document). Only the first
	                element is inspected to decide which form was passed.

	        Returns:
	            The batch produced by calling the processor, with "pixel_values"
	            replaced by a single stacked tensor in which every per-image
	            chunk is zero-padded to the length of the longest chunk.

	        Raises:
	            ValueError: If `images` is empty.
	        """
	        if len(images) == 0:
	            raise ValueError("No images provided")

	        if isinstance(images[0], list):
	            images = cast(List[List[Image.Image]], images)
	            text_doc = []
	            for doc_images in images:
	                # One image placeholder per image of the document.
	                conversation = [{"role": "user", "content": [{"type": "image"}] * len(doc_images)}]
	                template = self.apply_chat_template(conversation, add_generation_prompt=False)
	                text_doc.append(template[self.assistant_prefix_len :])

	        else:
	            images = cast(List[Image.Image], images)
	            # The special tokens must be spelled exactly (no escaped pipes),
	            # otherwise they do not match the tokenizer's special tokens.
	            text_doc = [
	                "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n"
	            ] * len(images)

	        # The following code is a hack to make sure the scatter in DDP is done correctly when training on multiple GPUs
	        batch_doc = self(text=text_doc, images=images, padding="longest", return_tensors="pt")  # type: ignore
	        # Separate pixel_values for each image: one chunk per row of
	        # image_grid_thw, sized by the product of its columns 1 and 2.
	        offsets = batch_doc["image_grid_thw"][:, 1] * batch_doc["image_grid_thw"][:, 2]
	        # Pad pixel_values to the same length to be able to make it into a tensor
	        pixel_values = torch.split(batch_doc["pixel_values"], offsets.tolist())

	        max_length = max(len(pv) for pv in pixel_values)

	        pixel_values = [
	            torch.cat([pv, torch.zeros((max_length - len(pv), pv.shape[1]), dtype=pv.dtype, device=pv.device)])
	            for pv in pixel_values
	        ]

	        batch_doc["pixel_values"] = torch.stack(pixel_values)
	        return batch_doc

	    def process_texts(
	        self,
	        texts: List[str],
	        max_length: int,
	        suffix: Optional[str] = None,
	        prefix: Optional[str] = None,
	        padding: Optional[str] = None,
	    ) -> BatchFeature:
	        """
	        Build the model inputs for a batch of texts.

	        Args:
	            texts: Raw input texts.
	            max_length: Maximum sequence length; longer inputs are truncated.
	            suffix: String appended to every text. Defaults to ten "<pad>"
	                markers when None.
	            prefix: If given (and non-empty), every text becomes
	                "{prefix}: {text}" before the suffix is appended.
	            padding: Padding strategy handed to the processor. Defaults to
	                "longest" when None or empty.

	        Returns:
	            The batch produced by calling the processor with PyTorch tensors.
	        """
	        if suffix is None:
	            suffix = "<pad>" * 10

	        padded_texts: List[str] = [
	            (f"{prefix}: {text}" if prefix else text) + suffix for text in texts
	        ]

	        text_batch = self(
	            text=padded_texts,
	            return_tensors="pt",
	            padding=padding or "longest",
	            max_length=max_length,
	            truncation=True,
	        )

	        return text_batch




	class ColQwenDuoProcessorBase(QwenVLEmbeddingProcessorBase):
	    """
	    Processor for ColQwenDuo. Mirrors the `ColQwen2Processor` class.
	    """

	    def score(
	        self,
	        qs: List[torch.Tensor],
	        ps: List[torch.Tensor],
	        vector_type: str,
	        device: Optional[Union[str, torch.device]] = None,
	        truncate: Optional[int] = None,
	        **kwargs,
	    ) -> torch.Tensor:
	        """
	        Score queries against passages with the scorer selected by `vector_type`.

	        "single_vector" dispatches to `score_single_vector` (dot product) and
	        "multi_vector" to `score_multi_vector` (ColBERT-like MaxSim); extra
	        keyword arguments are forwarded to the multi-vector scorer only.
	        When `truncate` is set (and non-zero), every embedding is first cut
	        down to its leading `truncate` entries along the last dimension.
	        Any other `vector_type` raises a ValueError.
	        """
	        # Truncation happens before the vector_type check.
	        if truncate:
	            qs = [query[..., :truncate] for query in qs]
	            ps = [passage[..., :truncate] for passage in ps]

	        if vector_type == "single_vector":
	            return self.score_single_vector(qs, ps, device=device)

	        if vector_type == "multi_vector":
	            return self.score_multi_vector(qs, ps, device=device, **kwargs)

	        raise ValueError('vector_type must be one of the following: [`single_vector`, `multi_vector`]')


	class ColQwen2DuoProcessor(ColQwenDuoProcessorBase, Qwen2VLProcessor):
	    """
	    ColQwenDuo processor backed by the Qwen2-VL processor implementation.
	    """

	    def __init__(self, *args, **kwargs) -> None:
	        """Forward all positional and keyword arguments to `Qwen2VLProcessor.__init__`."""
	        # Accept keyword arguments as well as positional ones: the previous
	        # `(self, args, *kwargs)` signature required one positional argument
	        # and rejected every keyword argument.
	        Qwen2VLProcessor.__init__(self, *args, **kwargs)