Commit e52d1ec · drbh committed
feat: mrope position id kernel and reference
Browse files:
- .gitignore +5 -0
- build.toml +19 -0
- ext-torch/get_position_ids/__init__.py +16 -0
- ext-torch/registration.h +27 -0
- ext-torch/torch_binding.cpp +11 -0
- ext-torch/torch_binding.h +6 -0
- flake.lock +97 -0
- flake.nix +21 -0
- get_position_ids/get_position_ids.cu +167 -0
- test/reference.py +91 -0
- test/test.py +101 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+.pytest_cache
+__pycache__
+.bak
+
+# result
build.toml ADDED
@@ -0,0 +1,19 @@
+[general]
+version = "0.0.1"
+
+[torch]
+name = "get_position_ids"
+src = [
+  "ext-torch/registration.h",
+  "ext-torch/torch_binding.cpp",
+  "ext-torch/torch_binding.h",
+]
+include = ["."]
+pyroot = "ext-torch"
+pyext = ["py", "json"]
+
+[kernel.get_position_ids]
+capabilities = ["7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"]
+src = ["get_position_ids/get_position_ids.cu"]
+include = ["."]
+depends = ["torch"]
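Note: the `capabilities` list spans compute capability 7.0 (Volta) through 9.0 (Hopper), so the kernel is presumably compiled once per listed CUDA architecture by kernel-builder, and `depends = ["torch"]` pulls in the Torch headers that the .cu file includes.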
ext-torch/get_position_ids/__init__.py ADDED
@@ -0,0 +1,16 @@
+import torch
+
+try:
+    from ._ops import ops
+except ImportError as e:
+    # Fallback for local development.
+    try:
+        import _get_position_ids
+
+        ops = torch.ops._get_position_ids
+    except ImportError:
+        raise e
+
+def get_position_ids(out: torch.Tensor, input_ids: torch.Tensor, image_grid_thw: torch.Tensor) -> torch.Tensor:
+    ops.get_position_ids(out, input_ids, image_grid_thw)
+    return out
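For orientation, a minimal usage sketch of this wrapper (not part of the commit, and assuming the extension has been built and is importable). Following test/test.py, the output buffer is sized from the reference implementation, since the number of produced position ids depends on the image grids rather than just the input length:

import torch
import get_position_ids
from reference import DummyModel  # test/reference.py

input_ids = torch.tensor([10] * 5 + [151652, 151653] + [20] * 5,
                         dtype=torch.int32, device="cuda")
image_grid_thw = torch.tensor([[2, 4, 6]], dtype=torch.int32, device="cuda")

# Size the output from the reference; the op fills `out` in place.
ref = DummyModel().get_position_ids(input_ids, image_grid_thw)
out = torch.empty(ref.shape, dtype=torch.int32, device="cuda")
get_position_ids.get_position_ids(out, input_ids, image_grid_thw)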
ext-torch/registration.h ADDED
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <Python.h>
+
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+
+// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+
+// REGISTER_EXTENSION allows the shared library to be loaded and initialized
+// via python's import statement.
+#define REGISTER_EXTENSION(NAME)                                               \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
+                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                           \
+  }
ext-torch/torch_binding.cpp ADDED
@@ -0,0 +1,11 @@
+#include "torch_binding.h"
+#include "registration.h"
+#include <torch/library.h>
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  ops.def("get_position_ids(Tensor out, Tensor input_ids, Tensor "
+          "image_grid_thw) -> ()");
+  ops.impl("get_position_ids", torch::kCUDA, &get_position_ids);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
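With TORCH_EXTENSION_NAME set by the build (presumably to `_get_position_ids`, given the fallback import in the Python wrapper above), this registers the schema `get_position_ids(Tensor out, Tensor input_ids, Tensor image_grid_thw) -> ()` and exposes the CUDA implementation as `torch.ops._get_position_ids.get_position_ids`, which is exactly what `__init__.py` calls.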
ext-torch/torch_binding.h ADDED
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <torch/torch.h>
+
+void get_position_ids(torch::Tensor &out, torch::Tensor &input_ids,
+                      torch::Tensor &image_grid_thw);
flake.lock ADDED
@@ -0,0 +1,97 @@
+{
+  "nodes": {
+    "flake-compat": {
+      "locked": {
+        "lastModified": 1733328505,
+        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "kernel-builder": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1738830746,
+        "narHash": "sha256-WwMzQXiHnkgb+4xEn3mlTOLJ9/7rInn+SJdaC/rQr3M=",
+        "ref": "refs/heads/main",
+        "rev": "21c056ac3575e78d4228e9ed7924cfbe987398d6",
+        "revCount": 73,
+        "submodules": true,
+        "type": "git",
+        "url": "git+ssh://[email protected]/huggingface/kernel-builder"
+      },
+      "original": {
+        "submodules": true,
+        "type": "git",
+        "url": "git+ssh://[email protected]/huggingface/kernel-builder"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1738247409,
+        "narHash": "sha256-F72dKl9Na6/2N+garOm9qCXPa92GzR8eYSuDra6kbjY=",
+        "owner": "danieldk",
+        "repo": "nixpkgs",
+        "rev": "358f57074b70e3ee9e1dc118151a4f6f81fcd3bb",
+        "type": "github"
+      },
+      "original": {
+        "owner": "danieldk",
+        "ref": "cuda-12.6-for-kernel-builder",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "kernel-builder": "kernel-builder"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
flake.nix ADDED
@@ -0,0 +1,21 @@
+{
+  description = "Flake for mrope_get_position_ids kernel";
+  inputs = {
+    kernel-builder = {
+      url = "git+ssh://[email protected]/huggingface/kernel-builder";
+      type = "git";
+      submodules = true;
+    };
+  };
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    kernel-builder.lib.genFlakeOutputs ./.;
+
+  nixConfig = {
+    extra-substituters = [ "https://kernel-builder.cachix.org" ];
+    extra-trusted-public-keys = [ "kernel-builder.cachix.org-1:JCt71vSCqW9tnmOsUigxf7tVLztjYxQ198FI/j8LrFQ=" ];
+  };
+}
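Presumably the intended workflow is the standard flake one: `nix build` (or `nix develop` for a development shell), with kernel-builder's `genFlakeOutputs` generating the build outputs for this directory. The `nixConfig` substituter entries let Nix pull prebuilt artifacts from the kernel-builder Cachix cache instead of compiling the toolchain locally.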
get_position_ids/get_position_ids.cu ADDED
@@ -0,0 +1,167 @@
+#include <cuda_runtime.h>
+#include <torch/torch.h>
+#include <vector>
+#include <stdio.h>
+
+#define SPATIAL_MERGE_SIZE 2
+#define MAX_THREADS_PER_BLOCK 256
+
+// Kernel: each block processes one vision segment.
+// For a given segment, the kernel computes image positions by "unraveling" a 1D index
+// into 3D coordinates (t_idx, h_idx, w_idx) and then adds a per-segment offset.
+__global__ void create_image_positions_kernel(
+    const int *image_grid_thw,                // shape: [num_segments * 3]
+    const int *segment_offsets,               // shape: [num_segments]
+    const int *vision_segment_lengths_cumsum, // shape: [num_segments]
+    int *image_positions)                     // output: shape [total_image_positions, 3]
+{
+    int segment_idx = blockIdx.x;
+
+    // Load grid dims for this segment.
+    int t = image_grid_thw[segment_idx * 3];
+    int h = image_grid_thw[segment_idx * 3 + 1] / SPATIAL_MERGE_SIZE;
+    int w = image_grid_thw[segment_idx * 3 + 2] / SPATIAL_MERGE_SIZE;
+    int total_length = t * h * w;
+
+    // Get the starting output position for this segment.
+    int pos_offset = segment_offsets[segment_idx];
+    // The per-segment offset to add to each coordinate.
+    int offset_add = vision_segment_lengths_cumsum[segment_idx];
+
+    // Process all positions in this segment using a block-stride loop.
+    for (int pos_idx = threadIdx.x; pos_idx < total_length; pos_idx += blockDim.x)
+    {
+        // Compute the "unraveled" coordinates.
+        int t_idx = pos_idx / (h * w);
+        int h_idx = (pos_idx / w) % h;
+        int w_idx = pos_idx % w;
+        // Write out the 3 coordinates (each image token gets 3 ints).
+        int out_index = (pos_offset + pos_idx) * 3;
+        image_positions[out_index] = t_idx + offset_add;
+        image_positions[out_index + 1] = h_idx + offset_add;
+        image_positions[out_index + 2] = w_idx + offset_add;
+    }
+}
+
+// This function computes text and image position ids then interleaves them as:
+// [text segment 0, image segment 0, text segment 1, image segment 1, ...].
+// If extra text tokens exist after the last vision segment, they are appended at the end.
+void get_position_ids(
+    torch::Tensor &out,            // Final output tensor
+    torch::Tensor &input_ids,      // tensor holding token ids
+    torch::Tensor &image_grid_thw) // tensor of shape [num_segments, 3]: each row is [t, h, w]
+{
+    TORCH_CHECK(input_ids.device().is_cuda(), "input_ids must be a CUDA tensor");
+    TORCH_CHECK(image_grid_thw.device().is_cuda(), "image_grid_thw must be a CUDA tensor");
+    TORCH_CHECK(out.device().is_cuda(), "out must be a CUDA tensor");
+
+    const int input_len = input_ids.size(0);
+    auto options_int = torch::TensorOptions().device(input_ids.device()).dtype(torch::kInt);
+    auto options_long = torch::TensorOptions().device(input_ids.device()).dtype(torch::kLong);
+
+    const int VISION_START_TOKEN_ID = 151652;
+    const int VISION_END_TOKEN_ID = 151653;
+
+    // Find vision segments.
+    auto vision_starts_mask = input_ids == VISION_START_TOKEN_ID;
+    auto vision_ends_mask = input_ids == VISION_END_TOKEN_ID;
+
+    auto starts = torch::where(vision_starts_mask)[0].to(torch::kInt);
+    auto ends = torch::where(vision_ends_mask)[0].to(torch::kInt);
+
+    int actual_segments = starts.size(0);
+    auto prev_end = torch::cat({torch::zeros({1}, options_long), ends.slice(0, 0, actual_segments - 1)});
+
+    // Compute text lengths between vision tokens.
+    auto text_lengths_between_vision = starts - prev_end + 1;
+    auto zeros = torch::zeros({1}, options_long);
+    auto widths = image_grid_thw.slice(0, 0, actual_segments).select(1, 2);
+    auto divided_widths = widths / SPATIAL_MERGE_SIZE;
+    auto vision_widths_max = torch::cat({zeros, divided_widths.slice(0, 0, actual_segments - 1)});
+    // The vision segment length is the sum of text tokens plus the (merged) image width.
+    auto vision_segment_lengths = text_lengths_between_vision + vision_widths_max;
+    auto vision_segment_lengths_cumsum = vision_segment_lengths.cumsum(0);
+    auto text_segment_lengths = vision_segment_lengths_cumsum - text_lengths_between_vision;
+
+    // Compute per-segment starting indices for image positions.
+    std::vector<int> segment_offsets_vec(actual_segments);
+    int total_image_positions = 0;
+    // (Using a CPU copy because the number of segments is small.)
+    auto image_grid_cpu = image_grid_thw.to(torch::kCPU);
+    auto image_grid_accessor = image_grid_cpu.accessor<int, 2>(); // shape: [actual_segments, 3]
+    for (int i = 0; i < actual_segments; i++)
+    {
+        int t = image_grid_accessor[i][0];
+        int h = image_grid_accessor[i][1] / SPATIAL_MERGE_SIZE;
+        int w = image_grid_accessor[i][2] / SPATIAL_MERGE_SIZE;
+        segment_offsets_vec[i] = total_image_positions;
+        total_image_positions += t * h * w;
+    }
+
+    // IMPORTANT: Create the segment_offsets tensor directly so that its memory is on the device.
+    auto segment_offsets_tensor = torch::tensor(segment_offsets_vec, options_int);
+
+    // Make sure vision_segment_lengths_cumsum is int and on the correct device.
+    auto vision_segment_lengths_cumsum_int = vision_segment_lengths_cumsum.to(torch::kInt);
+
+    // Allocate one contiguous output tensor for all image positions.
+    // Each image token produces 3 ints.
+    auto image_positions_tensor = torch::empty({total_image_positions, 3}, options_int);
+
+    // Launch one block per vision segment.
+    int threads = MAX_THREADS_PER_BLOCK;
+    int blocks = actual_segments;
+    create_image_positions_kernel<<<blocks, threads>>>(
+        image_grid_thw.data_ptr<int>(),
+        segment_offsets_tensor.data_ptr<int>(),
+        vision_segment_lengths_cumsum_int.data_ptr<int>(),
+        image_positions_tensor.data_ptr<int>());
+    cudaDeviceSynchronize();
+    cudaError_t error = cudaGetLastError();
+    TORCH_CHECK(error == cudaSuccess, "CUDA error: ", cudaGetErrorString(error));
+
+    // Process text segments on host.
+    // Each text segment is computed as a tensor of shape [3, seq_len] with all entries equal to text_segment_lengths[i].
+    std::vector<torch::Tensor> text_positions_list;
+    for (int i = 0; i < actual_segments; i++)
+    {
+        int seq_len = text_lengths_between_vision[i].item<int>();
+        auto text_range = torch::zeros({3, seq_len}, options_long) + text_segment_lengths[i];
+        text_positions_list.push_back(text_range);
+    }
+
+    // Interleave text and image segments.
+    std::vector<torch::Tensor> full_positions_list;
+    // For each vision segment, first add its text positions then add its image positions.
+    for (int i = 0; i < actual_segments; i++)
+    {
+        // Append text segment for vision segment i.
+        full_positions_list.push_back(text_positions_list[i]);
+        // Determine the slice boundaries for this vision segment's image positions.
+        int start = segment_offsets_vec[i];
+        int seg_length = 0;
+        if (i == actual_segments - 1)
+            seg_length = total_image_positions - segment_offsets_vec[i];
+        else
+            seg_length = segment_offsets_vec[i + 1] - segment_offsets_vec[i];
+        // Slice the image_positions_tensor for this segment.
+        // (Kernel output is [total_image_positions, 3]; we want a tensor of shape [3, seg_length] as in the Python reference.)
+        torch::Tensor image_segment = image_positions_tensor.slice(0, start, start + seg_length).t();
+        full_positions_list.push_back(image_segment);
+    }
+    // If there are extra text tokens after the last vision segment, add them.
+    int full_text_len = input_len - ends[actual_segments - 1].item<int>();
+    if (full_text_len > 0)
+    {
+        int max_s = full_positions_list.back().max().item<int>() + 1;
+        auto extra_text = torch::arange(full_text_len, options_long).view({1, -1}).expand({3, -1}) + max_s;
+        full_positions_list.push_back(extra_text);
+    }
+
+    // Concatenate along dimension 1 (the "position" dimension), then transpose so that the final tensor is [total_tokens, 3].
+    auto full_positions_concatenated = torch::cat(full_positions_list, 1);
+    auto full_positions_concatenated_transposed = full_positions_concatenated.t();
+
+    // Write final result to output tensor.
+    out.copy_(full_positions_concatenated_transposed);
}
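As a sanity check on the index math, here is the kernel's unravel logic for a single segment rewritten in plain Python (a hedged sketch, not part of the commit; `h` and `w` are assumed to already be divided by SPATIAL_MERGE_SIZE):

def unravel_positions(t, h, w, offset_add):
    # Mirrors create_image_positions_kernel for one segment: each flat
    # index is unraveled into (t_idx, h_idx, w_idx), then shifted by the
    # segment's cumulative offset.
    positions = []
    for pos_idx in range(t * h * w):
        t_idx = pos_idx // (h * w)
        h_idx = (pos_idx // w) % h
        w_idx = pos_idx % w
        positions.append((t_idx + offset_add, h_idx + offset_add, w_idx + offset_add))
    return positions

# e.g. a [2, 4, 6] grid with spatial_merge_size = 2 becomes t=2, h=2, w=3,
# producing 12 triples, each shifted by the segment's offset.
assert len(unravel_positions(2, 2, 3, offset_add=6)) == 12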
test/reference.py ADDED
@@ -0,0 +1,91 @@
+import torch
+from typing import Optional
+
+class DummyModel:
+    spatial_merge_size = 2
+    vision_start_token_id = 151652
+    vision_end_token_id = 151653
+
+    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
+    # modified to first find segments then initialize position ids for each segment
+    # Steps:
+    # locate all vision and text segments
+    # calculate `vision_segment_lengths` for each vision segment to be used as offset
+    # calculate `text_segment_lengths` for each text segment to be used as offset
+    # create position ids for each vision segment based on the image grid
+    # create position ids for each text segment
+    # combine all the position ids
+    # the final segment is the difference between the last vision segment and the end of the input
+    # combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
+    def get_position_ids(
+        self,
+        input_ids: torch.Tensor,
+        image_grid_thw: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if image_grid_thw is None:
+            return (
+                torch.arange(input_ids.shape[0], device=input_ids.device)
+                .unsqueeze(1)
+                .repeat(1, 3)
+            )
+
+        spatial_merge_size = self.spatial_merge_size
+        vision_start_token_id = self.vision_start_token_id
+        vision_end_token_id = self.vision_end_token_id
+        device = input_ids.device
+        dtype = input_ids.dtype
+        input_ids_len = input_ids.shape[0]
+
+        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
+        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
+        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
+        prev_vision_end = torch.cat(
+            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
+        )
+        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
+        vision_widths_max = torch.cat(
+            [
+                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
+                image_grid_thw[:-1, 2] // spatial_merge_size,
+            ]
+        )
+        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
+        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
+        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision
+
+        # create position ids for each vision segment based on the image grid
+        llm_pos_ids_list = []
+        for i, _ in enumerate(vision_segments):
+            t, h, w = (
+                image_grid_thw[i][0],
+                image_grid_thw[i][1] // spatial_merge_size,
+                image_grid_thw[i][2] // spatial_merge_size,
+            )
+            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
+            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
+            w_indices = torch.arange(w, device=device).repeat(t * h)
+            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)
+
+            # offset by the position of the last vision segment
+            im = image_position_ids + vision_segment_lengths[i]
+            llm_pos_ids_list.append(im)
+
+        # create position ids for each text segment
+        text_ranges = [
+            torch.zeros(3, seq_len, device=device) + text_segment_lengths[i]
+            for i, seq_len in enumerate(text_lengths_between_vision)
+        ]
+
+        full_llm_pos_ids_list = [
+            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
+        ]
+        max_s = full_llm_pos_ids_list[-1].max() + 1
+        final_text_len = input_ids_len - vision_ends[-1]
+        if final_text_len > 0:
+            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
+            full_llm_pos_ids_list.append(m + max_s)
+
+        position_ids = (
+            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
+        )
+        return position_ids
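Tracing this reference on the "one_segment" test case below (5 text tokens, the vision start/end pair, 5 trailing text tokens, grid [[2, 4, 6]]): vision_starts = [5] and vision_ends = [6], so text_lengths_between_vision = [6], vision_segment_lengths.cumsum = [6], and text_segment_lengths = [0]. The leading text segment contributes 6 constant (0, 0, 0) rows, the image segment contributes t*h*w = 2*2*3 = 12 rows offset by 6, and the trailing text contributes 6 rows counting up from max + 1 = 9, giving a (24, 3) result. Note that with no image placeholder tokens between the start/end markers, the output row count (24) differs from the input length (12) in this synthetic case.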
test/test.py ADDED
@@ -0,0 +1,101 @@
+import time
+import torch
+import pytest
+import get_position_ids  # noqa: E402
+from reference import DummyModel
+
+# Each configuration includes:
+# - name: A label for the test case.
+# - input_ids: A list of token IDs (with vision start (151652) and vision end (151653) tokens embedded).
+# - grid: A list of [t, h, w] values (one per vision segment).
+#
+# The cases below include:
+# 1. one_segment: a single vision segment.
+# 2. two_segments: two vision segments with extra text tokens afterward.
+# 3. three_segments: three vision segments.
+VISION_CONFIGS = [
+    {
+        "name": "one_segment",
+        "input_ids": (
+            [10] * 5 +          # 5 text tokens before vision segment
+            [151652, 151653] +  # vision tokens for segment 1
+            [20] * 5            # 5 extra text tokens after vision segment
+        ),
+        "grid": [[2, 4, 6]]  # one vision segment grid
+    },
+    {
+        "name": "two_segments",
+        "input_ids": (
+            [100] * 5 +         # 5 text tokens for segment 1
+            [151652, 151653] +  # vision tokens for segment 1
+            [101] * 5 +         # 5 text tokens for segment 2
+            [151652, 151653] +  # vision tokens for segment 2
+            [102] * 5           # 5 extra text tokens after last vision segment
+        ),
+        "grid": [
+            [2, 4, 6],  # vision segment 1 grid
+            [3, 4, 6]   # vision segment 2 grid
+        ],
+    },
+    {
+        "name": "three_segments",
+        "input_ids": (
+            [11] * 5 +          # Segment 1: 5 text tokens
+            [151652, 151653] +  # vision tokens for segment 1
+            [12] * 6 +          # Segment 2: 6 text tokens
+            [151652, 151653] +  # vision tokens for segment 2
+            [13] * 7 +          # Segment 3: 7 text tokens
+            [151652, 151653] +  # vision tokens for segment 3
+            [14] * 8            # 8 extra text tokens after the last vision segment
+        ),
+        "grid": [
+            [2, 4, 6],  # vision segment 1 grid
+            [3, 6, 6],  # vision segment 2 grid
+            [4, 4, 8]   # vision segment 3 grid
+        ],
+    },
+]
+
+CUDA_DEVICES = ["cuda"]  # List of CUDA devices; you can add more if needed.
+SEEDS = [42]             # Seeds for reproducibility.
+DTYPES = [torch.int32]   # In our test the tokens and grid are created with int32.
+
+
+@pytest.mark.parametrize("vision_config",
+                         VISION_CONFIGS,
+                         ids=[cfg["name"] for cfg in VISION_CONFIGS])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_get_position_ids(vision_config, seed, device):
+    torch.manual_seed(seed)
+    input_ids = torch.tensor(vision_config["input_ids"], dtype=torch.int32, device=device)
+    image_grid_thw = torch.tensor(vision_config["grid"], dtype=torch.int32, device=device)
+
+    # Create a DummyModel instance from the reference implementation.
+    dummy_model = DummyModel()
+
+    # reference implementation
+    torch.cuda.synchronize()
+    start_ref = time.perf_counter()
+    pos_ids_ref = dummy_model.get_position_ids(input_ids, image_grid_thw)
+    torch.cuda.synchronize()
+    end_ref = time.perf_counter()
+    ref_time = (end_ref - start_ref) * 1000  # ms
+    print(f"\nVision config {vision_config['name']} - Reference time: {ref_time:.2f} ms")
+    # Convert reference output to int32 for comparison (since it's returned as a float tensor).
+    pos_ids_ref = pos_ids_ref.to(dtype=torch.int32)
+
+    # kernel implementation
+    torch.cuda.synchronize()
+    start_ext = time.perf_counter()
+    out = torch.empty(pos_ids_ref.shape, dtype=torch.int32, device=device)
+    get_position_ids.get_position_ids(out, input_ids, image_grid_thw)
+    torch.cuda.synchronize()
+    end_ext = time.perf_counter()
+    ext_time = (end_ext - start_ext) * 1000  # ms
+    print(f"Vision config {vision_config['name']} - Extension time: {ext_time:.2f} ms\n")
+    ext_out = out.clone()
+
+    # verify the results
+    torch.testing.assert_close(ext_out.cpu(), pos_ids_ref.cpu())
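Once the extension is importable (e.g. after a kernel-builder build places it on the import path), these tests would typically be run with `python -m pytest test -v`; the exact invocation depends on how the built extension is exposed. The printed reference/extension timings are informational only; correctness is what `torch.testing.assert_close` verifies.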