# File size: 9,249 Bytes

# d1e5d70

import torch
import math

from abc import ABC, abstractmethod
from PIL import Image

from typing import Dict, List, Optional, Union, cast
from typing_extensions import Unpack

from transformers import BatchEncoding, BatchFeature
from transformers.processing_utils import (
    AllKwargsForChatTemplate,
    ImageInput,
    PreTokenizedInput,
    TextInput,
    VideoInput,
)
from transformers.models.qwen2_vl import Qwen2VLProcessor


def get_torch_device(device: str = "auto") -> str:
    """
    Resolve the device string that PyTorch should use.

    Any value other than "auto" is returned unchanged. For "auto" the first
    available option is picked, in this order:
    - "cuda:0"
    - "mps" (Apple Silicon)
    - "cpu"
    """
    # An explicit choice is passed through untouched.
    if device != "auto":
        return device

    if torch.cuda.is_available():
        return "cuda:0"
    if torch.backends.mps.is_available():  # for Apple Silicon
        return "mps"
    return "cpu"

class BaseVisualRetrieverProcessor(ABC):
    """
    Base class for visual retriever processors.

    Subclasses must implement `process_images`, `process_texts` and `score`.
    The static helpers `score_single_vector` and `score_multi_vector` are
    ready-made scoring routines that a `score` implementation can delegate to.
    """

    @abstractmethod
    def process_images(
        self,
        images: List[Image.Image],
    ) -> Union[BatchFeature, BatchEncoding]:
        """Convert a list of images into a batch of model inputs."""

    @abstractmethod
    def process_texts(
        self,
        texts: List[str],
        max_length: int = 50,
        suffix: Optional[str] = None,
        prefix: Optional[str] = None,
    ) -> Union[BatchFeature, BatchEncoding]:
        """Convert a list of texts into a batch of model inputs."""

    @abstractmethod
    def score(
        self,
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        device: Optional[Union[str, torch.device]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Score every query embedding in `qs` against every passage embedding in `ps`."""

    @staticmethod
    def score_single_vector(
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """
        Compute the dot product score for the given single-vector query and passage embeddings.

        Args:
            qs: Query embeddings, one 1-D tensor per query. They are stacked, so
                all of them must have the same shape.
            ps: Passage embeddings, one 1-D tensor per passage, same constraint.
            device: Device on which the scores are computed. When falsy, the
                device is picked by `get_torch_device("auto")`.

        Returns:
            A float32 tensor of shape `(len(qs), len(ps))`, left on `device`.

        Raises:
            ValueError: If `qs` or `ps` is empty.
        """
        device = device or get_torch_device("auto")

        if len(qs) == 0:
            raise ValueError("No queries provided")
        if len(ps) == 0:
            raise ValueError("No passages provided")

        qs_stacked = torch.stack(qs).to(device)
        ps_stacked = torch.stack(ps).to(device)

        # (b, d) x (c, d) -> (b, c): one dot product per (query, passage) pair.
        scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

        scores = scores.to(torch.float32)
        return scores

    @staticmethod
    def score_multi_vector(
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """
        Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.

        For every (query, passage) pair, each query vector is matched with its
        highest-scoring passage vector (dot product) and those maxima are summed.

        Args:
            qs: Query embeddings, one 2-D tensor `(n_vectors, dim)` per query.
                The number of vectors may differ between queries.
            ps: Passage embeddings, one 2-D tensor `(n_vectors, dim)` per passage.
            batch_size: Number of queries and number of passages scored together
                in one step. Must be positive.
            device: Device on which the scores are computed. When falsy, the
                device is picked by `get_torch_device("auto")`.

        Returns:
            A float32 tensor of shape `(len(qs), len(ps))`, located on the CPU.

        Raises:
            ValueError: If `qs` or `ps` is empty, or if `batch_size` is not positive.
        """
        device = device or get_torch_device("auto")

        if len(qs) == 0:
            raise ValueError("No queries provided")
        if len(ps) == 0:
            raise ValueError("No passages provided")
        # A non-positive step would either make `range` fail with an unrelated
        # message (0) or silently produce no batches at all (negative).
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        scores_list: List[torch.Tensor] = []

        for i in range(0, len(qs), batch_size):
            batch_scores: List[torch.Tensor] = []
            # Sequences in a batch are zero-padded to a common length. A padded
            # query row scores 0 against everything and so adds 0 to the sum.
            # NOTE: a padded passage row also scores 0, which wins the max when
            # every real similarity for that query vector is negative.
            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
                device
            )
            for j in range(0, len(ps), batch_size):
                ps_batch = torch.nn.utils.rnn.pad_sequence(
                    ps[j : j + batch_size], batch_first=True, padding_value=0
                ).to(device)
                # (b, n, d) x (c, s, d) -> (b, c, n, s); max over passage vectors s,
                # then sum over query vectors n.
                batch_scores.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
            # Move each finished row block to the CPU before the next query batch.
            scores_list.append(torch.cat(batch_scores, dim=1).cpu())

        scores = torch.cat(scores_list, dim=0)
        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

        scores = scores.to(torch.float32)
        return scores



class QwenVLProcessor(ABC):
    """
    Typed facade over the `__call__` and `apply_chat_template` methods of a processor.

    Both methods simply forward to the next class in the MRO, so this class is
    only usable when combined with a class that actually implements them.
    """

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, List[PreTokenizedInput]]] = None,
        videos: Optional[VideoInput] = None,
        **kwargs,
    ) -> BatchFeature:
        """Forward `images`, `text`, `videos` and any extra keyword arguments to the parent `__call__`."""
        batch = super().__call__(images=images, text=text, videos=videos, **kwargs)  # type: ignore
        return batch

    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
        chat_template: Optional[str] = None,
        **kwargs: Unpack[AllKwargsForChatTemplate],
    ) -> str:
        """Forward `conversation`, `chat_template` and any extra keyword arguments to the parent method."""
        rendered = super().apply_chat_template(  # type: ignore
            conversation=conversation,
            chat_template=chat_template,
            **kwargs,
        )
        return rendered


class QwenVLEmbeddingProcessorBase(BaseVisualRetrieverProcessor, QwenVLProcessor):
    """
    Image and text preprocessing shared by Qwen-VL based embedding processors.

    `process_images` and `process_texts` call `self(...)` and
    `self.apply_chat_template(...)`, so a concrete processor providing those
    methods must be part of the final class hierarchy.
    """

    # Number of leading characters removed from the chat-template output in
    # `process_images`, i.e. the length of the prefix created by
    # super().apply_chat_template(conversation=conversation, chat_template=chat_template, **kwargs)
    # NOTE(review): 58 equals the length of
    # "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - presumably the
    # default system block; confirm against the chat template actually in use.
    assistant_prefix_len: int = 58

    @staticmethod
    def round_by_factor(number: float, factor: int) -> int:
        """Returns the closest integer to 'number' that is divisible by 'factor'."""
        return round(number / factor) * factor

    @staticmethod
    def ceil_by_factor(number: float, factor: int) -> int:
        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
        return math.ceil(number / factor) * factor

    @staticmethod
    def floor_by_factor(number: float, factor: int) -> int:
        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
        return math.floor(number / factor) * factor

    def process_images(
        self,
        images: Union[List[Image.Image], List[List[Image.Image]]],
    ) -> BatchFeature:
        """
        Build model inputs for a batch of images.

        Args:
            images: Either a flat list of images (one image per document) or a
                list of image lists (several images per document). The first
                element decides which of the two layouts is assumed.

        Returns:
            The batch produced by `self(...)`, with `pixel_values` replaced by a
            single stacked tensor in which every per-image chunk is zero-padded
            to the length of the longest chunk.

        Raises:
            ValueError: If `images` is empty.
        """
        if len(images) == 0:
            raise ValueError("No images provided")

        if isinstance(images[0], list):
            images = cast(List[List[Image.Image]], images)
            text_doc = []
            for doc_images in images:
                # One image placeholder per image of the document.
                conversation = [{"role": "user", "content": [{"type": "image"}] * len(doc_images)}]
                template = self.apply_chat_template(conversation, add_generation_prompt=False)
                # Drop the fixed-length prefix emitted by the chat template.
                text_doc.append(template[self.assistant_prefix_len :])

        else:
            images = cast(List[Image.Image], images)
            text_doc = [
                "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n"
            ] * len(images)

        batch_doc = self(text=text_doc, images=images, padding="longest", return_tensors="pt")  # type: ignore

        # The following code is a hack to make sure the scatter in DDP is done correctly when training on multiple GPUs
        # Separate pixel_values for each image: the chunk length of an image is the
        # product of columns 1 and 2 of its `image_grid_thw` row.
        offsets = batch_doc["image_grid_thw"][:, 1] * batch_doc["image_grid_thw"][:, 2]
        pixel_values = torch.split(batch_doc["pixel_values"], offsets.tolist())

        # Pad pixel_values to the same length to be able to make it into a tensor
        max_length = max(len(pv) for pv in pixel_values)
        pixel_values = [
            torch.cat([pv, torch.zeros((max_length - len(pv), pv.shape[1]), dtype=pv.dtype, device=pv.device)])
            for pv in pixel_values
        ]

        batch_doc["pixel_values"] = torch.stack(pixel_values)
        return batch_doc

    def process_texts(
        self,
        texts: List[str],
        max_length: int = 50,
        suffix: Optional[str] = None,
        prefix: Optional[str] = None,
        padding: Optional[str] = None,
    ) -> BatchFeature:
        """
        Build model inputs for a batch of texts.

        Args:
            texts: Texts to encode.
            max_length: Maximum sequence length; longer inputs are truncated.
                Defaults to 50, the default declared by the abstract base method.
            suffix: String appended to every text. Defaults to `"<pad>" * 10`.
            prefix: When non-empty, every text becomes `"{prefix}: {text}"`.
            padding: Padding strategy forwarded to `self(...)`; `"longest"` when
                not given.

        Returns:
            The batch produced by `self(...)` with PyTorch tensors.
        """
        if suffix is None:
            suffix = "<pad>" * 10

        padded_texts: List[str] = [(f"{prefix}: {text}" if prefix else text) + suffix for text in texts]

        text_batch = self(
            text=padded_texts,
            return_tensors="pt",
            padding=padding or "longest",
            max_length=max_length,
            truncation=True,
        )

        return text_batch




class ColQwenDuoProcessorBase(QwenVLEmbeddingProcessorBase):
    """
    Processor for ColQwenDuo. Mirrors the `ColQwen2Processor` class.
    """

    def score(
        self,
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        vector_type: str,
        device: Optional[Union[str, torch.device]] = None,
        truncate: Optional[int] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Score queries against passages with the scorer selected by `vector_type`.

        Args:
            qs: Query embeddings, one tensor per query.
            ps: Passage embeddings, one tensor per passage.
            vector_type: "single_vector" (dot product via `score_single_vector`)
                or "multi_vector" (MaxSim via `score_multi_vector`).
            device: Device forwarded to the selected scorer.
            truncate: When truthy, only the first `truncate` entries along the
                last dimension of every embedding are used.
            **kwargs: Forwarded to `score_multi_vector` only; ignored for
                "single_vector".

        Raises:
            ValueError: If `vector_type` is neither of the two supported values.
        """
        if truncate:
            # Keep only the leading part of the embedding dimension.
            leading = slice(None, truncate)
            qs, ps = [q[..., leading] for q in qs], [p[..., leading] for p in ps]

        if vector_type == "single_vector":
            return self.score_single_vector(qs, ps, device=device)
        if vector_type == "multi_vector":
            return self.score_multi_vector(qs, ps, device=device, **kwargs)
        raise ValueError('vector_type must be one of the following: [`single_vector`, `multi_vector`]')


class ColQwen2DuoProcessor(ColQwenDuoProcessorBase, Qwen2VLProcessor):
    """
    Concrete processor combining `ColQwenDuoProcessorBase` with `Qwen2VLProcessor`.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Forward all positional and keyword arguments to `Qwen2VLProcessor.__init__`."""
        # Initialization is delegated directly to Qwen2VLProcessor.__init__
        # rather than going through super().
        Qwen2VLProcessor.__init__(self, *args, **kwargs)