{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "dca3239c-17d6-4284-a2cf-83237a55a7df", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c99a5a79955e4e84a260ffacd313e0ed", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading pipeline components...: 0%| | 0/5 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "нарядная новогодняя елка, красивые игрушки, звезда сверху, огоньки, на тёмном фоне\n" ] } ], "source": [ "prompt = '1girl, solo, animal ears, bow, teeth, jacket, tail, open mouth, brown hair, orange background, bowtie, orange nails, simple background, cat ears, orange eyes, blue bow, animal ear fluff, cat tail, looking at viewer, upper body, shirt, school uniform, hood, striped bow, striped, white shirt, black jacket, blue bowtie, fingernails, long sleeves, cat girl, bangs, fangs, collared shirt, striped bowtie, short hair, tongue, hoodie, sharp teeth, facial mark, claw pose'\n", "prompt = 'нарядная новогодняя елка, красивые игрушки, звезда сверху, огоньки, на тёмном фоне'\n", "generator = torch.Generator(device=\"cuda\").manual_seed(42)\n", "\n", "image = pipe(\n", " prompt = prompt,\n", " negative_prompt = \"\",\n", " generator=generator,\n", ")[0]\n", "\n", "for img in image:\n", " img.show()\n", " print(prompt)\n", " " ] }, { "cell_type": "code", "execution_count": 19, "id": "7118f94a-e747-4bce-a464-2d8ed34cac6a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 512, 1024])\n", "tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')\n" ] } ], "source": [ "prompt_embeds, prompt_attention_mask2 = txt_embeds(\"cat\")\n", "print(prompt_embeds.shape)\n", "print(prompt_attention_mask2)" ] }, { "cell_type": "code", "execution_count": 5, "id": "1b82a561-b93b-4261-9bc7-fe867168bdd2", 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SiglipImageProcessor {\n", " \"do_convert_rgb\": null,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"image_mean\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"image_processor_type\": \"SiglipImageProcessor\",\n", " \"image_std\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"processor_class\": \"SiglipProcessor\",\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"size\": {\n", " \"height\": 384,\n", " \"width\": 384\n", " }\n", "}\n", "\n", "tensor([[[[-0.2471, -0.2236, -0.2158, ..., -0.9844, -0.9766, -0.9844],\n", " [-0.3730, -0.3418, -0.2871, ..., -0.9844, -0.9766, -0.9844],\n", " [-0.5625, -0.5234, -0.4668, ..., -0.9844, -0.9844, -0.9844],\n", " ...,\n", " [ 0.7734, 0.8672, 0.9062, ..., -0.2002, -0.1846, -0.2002],\n", " [ 0.9062, 0.8984, 0.8906, ..., -0.2002, -0.2002, -0.1533],\n", " [ 0.9062, 0.8906, 0.8984, ..., -0.2080, -0.2236, -0.1689]],\n", "\n", " [[ 0.3027, 0.3184, 0.3105, ..., -0.1216, -0.1060, -0.1060],\n", " [ 0.2393, 0.2471, 0.2793, ..., -0.1216, -0.1060, -0.1060],\n", " [ 0.1611, 0.1611, 0.1768, ..., -0.1216, -0.1138, -0.1060],\n", " ...,\n", " [ 0.6797, 0.7734, 0.7969, ..., -0.3652, -0.3965, -0.4043],\n", " [ 0.8047, 0.7969, 0.7891, ..., -0.3652, -0.3809, -0.3262],\n", " [ 0.8047, 0.7891, 0.7969, ..., -0.3730, -0.3887, -0.3184]],\n", "\n", " [[ 0.7109, 0.7031, 0.6719, ..., 0.5469, 0.5547, 0.5547],\n", " [ 0.6953, 0.6875, 0.6953, ..., 0.5469, 0.5547, 0.5547],\n", " [ 0.6797, 0.6719, 0.6719, ..., 0.5469, 0.5547, 0.5547],\n", " ...,\n", " [ 0.5078, 0.6172, 0.6641, ..., -0.4199, -0.4121, -0.4590],\n", " [ 0.6719, 0.6719, 0.6641, ..., -0.3809, -0.4199, -0.3574],\n", " [ 0.6797, 0.6641, 0.6719, ..., -0.3965, -0.3965, -0.3418]]]],\n", " device='cuda:0', dtype=torch.bfloat16)\n" ] } ], "source": [ "from transformers import AutoModel, AutoTokenizer, AutoImageProcessor\n", "from PIL import Image\n", "import requests\n", "import torch\n", "\n", "# Загрузка модели, токенизатора и процессора изображений\n", "model = AutoModel.from_pretrained(\n", " \"visheratin/mexma-siglip\",\n", " torch_dtype=torch.bfloat16,\n", " trust_remote_code=True,\n", " optimized=True\n", ").to(\"cuda\")\n", "processor = AutoImageProcessor.from_pretrained(\"visheratin/mexma-siglip\")\n", "\n", "# Загрузка и обработка изображения\n", "img_url = \"https://static.independent.co.uk/s3fs-public/thumbnails/image/2014/03/25/12/eiffel.jpg\"\n", "img = Image.open(requests.get(img_url, stream=True).raw)\n", "img = processor(images=img, return_tensors=\"pt\")[\"pixel_values\"]\n", "img = img.to(torch.bfloat16).to(\"cuda\")\n", "\n", "# Получение логов и эмбедингов\n", "with torch.inference_mode():\n", " print(processor)\n", " print(img)\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 1152])\n" ] } ], "source": [ "from transformers import AutoModel, AutoTokenizer, AutoImageProcessor\n", "from PIL import Image\n", "import requests\n", "import torch\n", "\n", "# Загрузка модели, токенизатора и процессора изображений\n", "model = AutoModel.from_pretrained(\n", " \"visheratin/mexma-siglip\",\n", " torch_dtype=torch.bfloat16,\n", " trust_remote_code=True,\n", " optimized=True\n", ").to(\"cuda\")\n", "processor = AutoImageProcessor.from_pretrained(\"visheratin/mexma-siglip\")\n", "\n", "# Загрузка и обработка 
изображения\n", "img_url = \"https://static.independent.co.uk/s3fs-public/thumbnails/image/2014/03/25/12/eiffel.jpg\"\n", "img = Image.open(requests.get(img_url, stream=True).raw)\n", "img = processor(images=img, return_tensors=\"pt\")[\"pixel_values\"]\n", "img = img.to(torch.bfloat16).to(\"cuda\")\n", "\n", "# Получение эмбеддингов изображения\n", "with torch.inference_mode():\n", " image_embeddings = model.encode_images(img, normalize=False)\n", "\n", "# Вывод эмбеддингов\n", "print(image_embeddings.shape)" ] }, { "cell_type": "code", "execution_count": 7, "id": "c9703745-cc92-46cf-a216-025a80907104", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of image embeddings: torch.Size([1, 1152])\n", "Data type of image embeddings: torch.bfloat16\n", "Device of image embeddings: cuda:0\n", "Image embeddings: tensor([[-0.0287, -0.0082, 0.0444, ..., -0.0011, 0.0306, 0.0251]],\n", " device='cuda:0', dtype=torch.bfloat16)\n" ] } ], "source": [ "# Вывод формы тензора\n", "print(\"Shape of image embeddings:\", image_embeddings.shape)\n", "\n", "# Вывод типа данных тензора\n", "print(\"Data type of image embeddings:\", image_embeddings.dtype)\n", "\n", "# Вывод устройства, на котором находится тензор (CPU или GPU)\n", "print(\"Device of image embeddings:\", image_embeddings.device)\n", "\n", "# Вывод самого тензора (его значений)\n", "print(\"Image embeddings:\", image_embeddings)" ] }, { "cell_type": "code", "execution_count": 8, "id": "83729d70-b8a6-4c45-a222-2db608936bf4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Linear(in_features=1024, out_features=1152, bias=False)\n" ] } ], "source": [ "print(model.text_projector)" ] }, { "cell_type": "code", "execution_count": 15, "id": "426dfe31-59b0-4942-879a-2a78f5271710", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[-1.9409e-02, 1.1110e-04, 1.4709e-02, ..., -1.9897e-02,\n", " 7.8125e-03, -5.6763e-03],\n", " [-8.2397e-03, 4.0588e-03, -4.8828e-04, ..., -2.8687e-02,\n", " -9.4604e-03, 1.2329e-02],\n", " [ 3.9062e-03, -1.5991e-02, 5.0306e-05, ..., 3.2349e-03,\n", " -6.5308e-03, 1.4954e-02],\n", " ...,\n", " [-1.3245e-02, 7.0190e-03, 1.5198e-02, ..., -8.9111e-03,\n", " 4.3335e-03, -2.5635e-02],\n", " [-2.4414e-02, 1.7822e-02, 2.4658e-02, ..., 5.0659e-03,\n", " 3.2654e-03, 7.8125e-03],\n", " [ 4.4250e-03, -1.2451e-02, -2.4902e-02, ..., -1.2817e-02,\n", " -1.2665e-03, -1.5076e-02]], device='cuda:0')\n", "tensor([[ 0.3091, 0.0765, -0.1184, ..., -0.0156, 0.1962, -0.1573],\n", " [-0.0393, 0.0771, 0.1193, ..., 0.1507, -0.0296, -0.0949],\n", " [ 0.0438, 0.1021, -0.0141, ..., 0.0935, -0.0122, -0.0239],\n", " ...,\n", " [-0.1386, -0.0726, -0.0899, ..., 0.2193, 0.2369, -0.0670],\n", " [ 0.4150, 0.0092, -0.1147, ..., -0.1390, -0.0257, -0.3204],\n", " [ 0.1041, 0.0360, 0.2886, ..., 0.0200, 0.0378, 0.0312]],\n", " device='cuda:0')\n", "Linear(in_features=1152, out_features=1024, bias=False)\n", "Shape of image embeddings: torch.Size([1, 1024])\n", "Shape of image embeddings: torch.Size([1, 512, 1024])\n" ] } ], "source": [ "import torch\n", "\n", "# Получаем матрицу весов проектора\n", "weight_matrix = model.text_projector.weight.data # [1152, 1024]\n", "weight_matrix = weight_matrix.float() # Преобразуем в float32\n", "print(weight_matrix)\n", "\n", "# Вычисляем псевдообратную матрицу\n", "pseudo_inverse = torch.pinverse(weight_matrix) # [1024, 1152]\n", "print(pseudo_inverse)\n", "\n", "# Создаем обратный проектор\n", "image_projector = 
torch.nn.Linear(1152, 1024, bias=False)\n", "image_projector.weight.data = pseudo_inverse\n", "print(image_projector)\n", "\n", "with torch.no_grad():\n", " image_embeddings_transformed = image_projector(image_embeddings.float())\n", " print(\"Shape of image embeddings:\", image_embeddings_transformed.shape)\n", "\n", "# Расширяем эмбеддинг изображения до [1, 512, 1024]\n", "with torch.no_grad():\n", " x = image_projector(image_embeddings.float()) # [1, 1024]\n", " x = x.unsqueeze(1).repeat(1, 512, 1) # [1, 512, 1024]\n", "print(\"Shape of image embeddings:\", x.shape)" ] }, { "cell_type": "code", "execution_count": 13, "id": "c704cd13-eea4-437e-96fa-82f1735b0ac7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of image embeddings: torch.Size([1, 1, 1024])\n" ] }, { "ename": "RuntimeError", "evalue": "mat1 and mat2 shapes cannot be multiplied (1x1152 and 1024x1152)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[13], line 31\u001b[0m\n\u001b[1;32m 28\u001b[0m proj\u001b[38;5;241m.\u001b[39mweight\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mpinverse(weight_matrix)\u001b[38;5;241m.\u001b[39mT \u001b[38;5;66;03m# [1024, 1152]\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# Применяем к эмбеддингам изображений\u001b[39;00m\n\u001b[0;32m---> 31\u001b[0m image_embeddings_transformed \u001b[38;5;241m=\u001b[39m \u001b[43mimage_projector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage_embeddings\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28mprint\u001b[39m(image_embeddings_transformed\u001b[38;5;241m.\u001b[39mshape) \u001b[38;5;66;03m# [1, 512, 1024]\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of image embeddings:\u001b[39m\u001b[38;5;124m\"\u001b[39m, image_embeddings_transformed\u001b[38;5;241m.\u001b[39mshape)\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call 
forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", "Cell \u001b[0;32mIn[13], line 17\u001b[0m, in \u001b[0;36mImageProjector.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# x: [batch_size, 1152]\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Применяем каждый проектор к эмбеддингу изображения\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[43mproj\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprojectors\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;66;03m# Список из 512 тензоров [1, 1024]\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mstack(outputs, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", "Cell \u001b[0;32mIn[13], line 17\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# x: [batch_size, 1152]\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Применяем каждый проектор к эмбеддингу изображения\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m outputs \u001b[38;5;241m=\u001b[39m [\u001b[43mproj\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m proj \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprojectors] \u001b[38;5;66;03m# Список из 512 тензоров [1, 1024]\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mstack(outputs, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", "File 
\u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/linear.py:125\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[0;31mRuntimeError\u001b[0m: mat1 and mat2 shapes cannot be multiplied (1x1152 and 1024x1152)" ] } ], "source": [ "print(\"Shape of image embeddings:\", image_embeddings_transformed.shape)\n", "import torch\n", "import torch.nn as nn\n", "\n", "class ImageProjector(nn.Module):\n", " def __init__(self, input_dim=1152, output_dim=1024, seq_len=512):\n", " super().__init__()\n", " # Создаем 512 отдельных проекторов (по одному на каждый токен)\n", " self.projectors = nn.ModuleList([\n", " nn.Linear(input_dim, output_dim, bias=False) \n", " for _ in range(seq_len)\n", " ])\n", " \n", " def forward(self, x):\n", " # x: [batch_size, 1152]\n", " # Применяем каждый проектор к эмбеддингу изображения\n", " outputs = [proj(x) for proj in self.projectors] # Список из 512 тензоров [1, 1024]\n", " return torch.stack(outputs, dim=1) # [1, 512, 1024]\n", "\n", "# Инициализируем проектор\n", "image_projector = ImageProjector()\n", "\n", "# Загружаем веса из псевдообратной матрицы (если нужно)\n", "with torch.no_grad():\n", " for proj in image_projector.projectors:\n", " weight_matrix = model.text_projector.weight.data # [1152, 1024]\n", " weight_matrix = weight_matrix.float() # Преобразуем в float32\n", " proj.weight.data = torch.pinverse(weight_matrix).T # [1024, 1152]\n", "\n", "# Применяем к эмбеддингам изображений\n", "image_embeddings_transformed = image_projector(image_embeddings.float())\n", "print(image_embeddings_transformed.shape) # [1, 512, 1024]\n", "print(\"Shape of image embeddings:\", image_embeddings_transformed.shape)" ] }, { "cell_type": "code", "execution_count": 14, "id": "19ebad1b-aced-4f12-8bff-ce89cc89e32f", "metadata": {}, "outputs": [ { "ename": "RuntimeError", "evalue": "mat1 and mat2 shapes cannot be multiplied (1x1152 and 1024x1152)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[14], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Расширяем эмбеддинг изображения до [1, 512, 1024]\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m----> 7\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mimage_projector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage_embeddings\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# [1, 1024]\u001b[39;00m\n\u001b[1;32m 8\u001b[0m x \u001b[38;5;241m=\u001b[39m x\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mrepeat(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m512\u001b[39m, \u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# [1, 512, 1024]\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of image embeddings:\u001b[39m\u001b[38;5;124m\"\u001b[39m, image_embeddings\u001b[38;5;241m.\u001b[39mshape)\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/linear.py:125\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[0;31mRuntimeError\u001b[0m: mat1 and mat2 shapes cannot be multiplied (1x1152 and 1024x1152)" ] } ], "source": [ "print(\"Shape of image embeddings:\", image_embeddings_transformed.shape)\n", "# Простой 
проектор\n", "image_projector = nn.Linear(1152, 1024, bias=False)\n", "image_projector.weight.data = torch.pinverse(model.text_projector.weight.data.float()).T\n", "\n", "# Расширяем эмбеддинг изображения до [1, 512, 1024]\n", "with torch.no_grad():\n", " x = image_projector(image_embeddings.float()) # [1, 1024]\n", " x = x.unsqueeze(1).repeat(1, 512, 1) # [1, 512, 1024]\n", "print(\"Shape of image embeddings:\", image_embeddings.shape)" ] }, { "cell_type": "code", "execution_count": 21, "id": "67fdfd36-17c4-4899-95e0-10853ee4e2a6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 512])\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dd140e24dfb3457eab3f818507ae3123", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/20 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "нарядная новогодняя елка, красивые игрушки, звезда сверху, огоньки, на тёмном фоне\n" ] } ], "source": [ "image_embeddings_transformed = x\n", "prompt_attention_mask = torch.ones(image_embeddings_transformed.shape[0], image_embeddings_transformed.shape[1])\n", "print(prompt_attention_mask.shape)\n", "image_embeddings_transformed = image_embeddings_transformed.to(\"cuda\")\n", "prompt_attention_mask =prompt_attention_mask.to(\"cuda\")\n", "image = pipe(\n", " prompt_embeds = image_embeddings_transformed,\n", " prompt_attention_mask = prompt_attention_mask,\n", " #negative_prompt = \"\",\n", " generator=generator,\n", ")[0]\n", "\n", "for img in image:\n", " img.show()\n", " print(prompt)" ] }, { "cell_type": "code", "execution_count": 6, "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", " with torch.no_grad(), torch.cuda.amp.autocast():\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 1152])\n", "tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n", "Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
{ "cell_type": "code", "execution_count": 6, "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", " with torch.no_grad(), torch.cuda.amp.autocast():\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 1152])\n", "tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n", "Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", " with torch.no_grad(), torch.cuda.amp.autocast():\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "All patches shape: torch.Size([1, 1152])\n" ] } ], "source": [ "import torch\n", "import torch.nn.functional as F\n", "from urllib.request import urlopen\n", "from PIL import Image\n", "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n", "\n", "model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n", "tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n", "\n", "image = Image.open(urlopen(\n", " 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n", "))\n", "image = preprocess(image).unsqueeze(0)\n", "\n", "labels_list = [\"a dog\", \"a cat\", \"a donut\", \"a beignet\"]\n", "text = tokenizer(labels_list, context_length=model.context_length)\n", "\n", "with torch.no_grad(), torch.cuda.amp.autocast():\n", " image_features = model.encode_image(image)\n", " print(image_features.shape)\n", " print(image_features)\n", " text_features = model.encode_text(text)\n", " image_features = F.normalize(image_features, dim=-1)\n", " text_features = F.normalize(text_features, dim=-1)\n", "\n", " text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)\n", "\n", "zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))\n", "print(\"Label probabilities: \", zipped_list)\n", "\n", "with torch.no_grad(), torch.cuda.amp.autocast():\n", " # Get the hidden states of all patches\n", " outputs = model.visual(image) # [batch_size, num_patches + 1, hidden_dim]\n", " print(\"All patches shape:\", outputs.shape) # Example: [1, 256, 1152]\n", " #all_patch_embeddings = outputs[:, 1:, :] # Ignore the [CLS] token\n", " #print(\"All patches shape:\", all_patch_embeddings.shape) # Example: [1, 256, 1152]\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_19418/2526917774.py:1: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead.\n", " with torch.no_grad(), torch.cuda.amp.autocast():\n" ] }, { "ename": "AttributeError", "evalue": "'TimmModel' object has no attribute 'patch_embed'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(), torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mamp\u001b[38;5;241m.\u001b[39mautocast():\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Извлекаем патчи и позиционные эмбеддинги\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisual\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpatch_embed\u001b[49m(image) \u001b[38;5;66;03m# [1, num_patches, 1152]\u001b[39;00m\n\u001b[1;32m 4\u001b[0m x \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_drop(x \u001b[38;5;241m+\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_embed)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Проход через трансформерные блоки\u001b[39;00m\n", "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1931\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1930\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1931\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1932\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1933\u001b[0m )\n", "\u001b[0;31mAttributeError\u001b[0m: 'TimmModel' object has no attribute 'patch_embed'" ] } ], "source": [ "with torch.no_grad(), torch.cuda.amp.autocast():\n", " # Извлекаем патчи и позиционные эмбеддинги\n", " x = model.visual.patch_embed(image) # [1, num_patches, 1152]\n", " x = model.visual.pos_drop(x + model.visual.pos_embed)\n", " \n", " # Проход через трансформерные блоки\n", " for blk in model.visual.blocks:\n", " x = blk(x)\n", " \n", " # Применяем LayerNorm (если есть)\n", " if hasattr(model.visual, \"norm\"):\n", " x = model.visual.norm(x)\n", " \n", " # Теперь x содержит все патчи\n", " print(\"All patches shape:\", x.shape)\n", " # Пример вывода: torch.Size([1, 756, 1152])" ] }, { "cell_type": "code", "execution_count": null, "id": "29ecd610-7121-4c39-80cf-5021b80f6431", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }