rezasalatin committed
Commit 0e4f45d · 1 Parent(s): b4cc03f

Add all files and directories
Training_Station/Duck_Rectified/labels.txt ADDED
@@ -0,0 +1,3 @@
+ __ignore__
+ _background_
+ water
image_module/__init__.py ADDED
File without changes
image_module/dataset_water.py ADDED
@@ -0,0 +1,161 @@
+ import os
+ import numpy as np
+ from glob import glob
+ import torchvision.transforms.functional as TF
+ from PIL import Image
+ from torch.utils import data
+
+ from . import transforms as my_tf
+ from myutils import load_image_in_PIL as load_img
+
+
+ def load_image_in_PIL(path, mode='RGB'):
+     img = Image.open(path)
+     img.load()  # Very important for loading large images
+     return img.convert(mode)
+
+
+ class WaterDataset(data.Dataset):
+
+     def __init__(self, mode, dataset_path, input_size=None, test_case=None, eval_size=None):
+
+         super(WaterDataset, self).__init__()
+
+         self.mode = mode
+         self.input_size = input_size
+         self.test_case = test_case
+         self.img_list = []
+         self.label_list = []
+         self.verbose_flag = False
+         self.online_augmentation_per_epoch = 640
+         self.eval_size = eval_size
+
+         if mode == 'train_offline':
+             with open(os.path.join(dataset_path, 'train_imgs.txt')) as f:
+                 water_subdirs = f.readlines()
+             water_subdirs = [x.strip() for x in water_subdirs]
+
+             print('Initialize offline training dataset:')
+
+             for sub_folder in water_subdirs:
+                 label_list = glob(os.path.join(dataset_path, 'Annotations/', sub_folder, '*.png'))
+                 label_list.sort(key=lambda x: (len(x), x))
+                 self.label_list += label_list
+
+                 name_list = [os.path.basename(x)[:-4] for x in label_list]
+
+                 img_list = glob(os.path.join(dataset_path, 'JPEGImages/', sub_folder, '*.jpg'))
+                 img_list.sort(key=lambda x: (len(x), x))
+                 img_list_valid = []
+                 for img_path in img_list:
+                     if os.path.basename(img_path)[:-4] in name_list:
+                         img_list_valid.append(img_path)
+
+                 self.img_list += img_list_valid
+
+                 print('Add', sub_folder, len(img_list_valid), 'files.')
+
+         elif mode == 'eval':
+             if test_case is None:
+                 raise ValueError('test_case cannot be None.')
+
+             img_path = os.path.join(dataset_path, 'JPEGImages/', test_case)
+             img_list = os.listdir(img_path)
+             img_list.sort(key=lambda x: (len(x), x))
+             self.img_list = [os.path.join(img_path, name) for name in img_list]
+
+             first_frame_label_path = os.path.join(dataset_path, 'Annotations/', test_case, img_list[0])
+
+             # Detect label image format: png or jpg
+             first_frame_label_path = first_frame_label_path[:-3]
+             if os.path.exists(first_frame_label_path + 'png'):
+                 first_frame_label_path += 'png'
+             else:
+                 first_frame_label_path += 'jpg'
+
+             if not os.path.exists(first_frame_label_path):
+                 label_list = glob(os.path.join(dataset_path, 'Annotations/', test_case, '*.png'))
+                 label_list.sort(key=lambda x: (x, len(x)))
+                 first_frame_label_path = label_list[0]
+
+             self.first_frame = load_image_in_PIL(self.img_list[0], 'RGB')
+             self.img_list.pop(0)
+
+             self.first_frame_label = load_image_in_PIL(first_frame_label_path, 'P')
+
+             if self.eval_size:
+                 self.origin_size = self.first_frame.size
+                 self.first_frame = self.first_frame.resize(self.eval_size, Image.ANTIALIAS)
+                 self.first_frame_label = self.first_frame_label.resize(self.eval_size, Image.ANTIALIAS)
+
+         else:
+             raise ValueError('Mode %s is not supported; choose from [train_offline, train_online, eval].' % mode)
+
+     def __len__(self):
+         if self.mode == 'train_online':
+             return self.online_augmentation_per_epoch
+         else:
+             return len(self.img_list)
+
+     def get_first_frame(self):
+         img_tf = TF.to_tensor(self.first_frame)
+         img_tf = my_tf.imagenet_normalization(img_tf)
+         return img_tf
+
+     def get_first_frame_label(self):
+         return TF.to_tensor(self.first_frame_label)
+
+     def __getitem__(self, index):
+         raise NotImplementedError
+
+
+ class WaterDataset_RGB(WaterDataset):
+     def __init__(self, mode, dataset_path, input_size=None, test_case=None, eval_size=None):
+         super(WaterDataset_RGB, self).__init__(mode, dataset_path, input_size, test_case, eval_size)
+
+     def __getitem__(self, index):
+         if self.mode == 'train_offline' or self.mode == 'val_offline' or self.mode == 'test_offline':
+             img = load_img(self.img_list[index], 'RGB')
+             label = load_img(self.label_list[index], 'P')
+             return self.apply_transforms(img, label)
+         elif self.mode == 'train_online':
+             return self.apply_transforms(self.first_frame, self.first_frame_label)
+         elif self.mode == 'eval':
+             img = load_img(self.img_list[index], 'RGB')
+             if self.eval_size:
+                 img = img.resize(self.eval_size, Image.ANTIALIAS)
+             return self.apply_transforms(img)
+         else:
+             raise Exception("Error: Invalid dataset mode!")
+
+     def resize_to_origin(self, img):
+         return img.resize(self.origin_size)
+
+     def apply_transforms(self, img, label=None):
+         if self.mode == 'train_offline' or self.mode == 'train_online':
+             img = my_tf.random_adjust_color(img, self.verbose_flag)
+             img, label = my_tf.random_affine_transformation(img, None, label, self.verbose_flag)
+             img, label = my_tf.random_resized_crop(img, None, label, self.input_size, self.verbose_flag)
+         elif self.mode == 'test_offline' or self.mode == 'val_offline':
+             img = TF.resize(img, self.input_size)
+             label = TF.resize(label, self.input_size)
+         elif self.mode == 'eval':
+             pass
+
+         img_orig = TF.to_tensor(img)
+         img_norm = my_tf.imagenet_normalization(img_orig)
+
+         if self.mode == 'train_offline' or self.mode == 'train_online':
+             # label = TF.to_tensor(label)
+             label = np.expand_dims(np.array(label, np.float32), axis=0)
+             return img_norm, label
+         elif self.mode == 'val_offline':
+             label = np.expand_dims(np.array(label, np.float32), axis=0)
+             return img_norm, label
+         elif self.mode == 'test_offline':
+             label = np.expand_dims(np.array(label, np.float32), axis=0)
+             return img_norm, label, img_orig
+         else:
+             return None
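As a quick orientation (not part of the commit itself), a minimal sketch of how this dataset class is meant to be consumed, assuming the repository root is on the import path and a placeholder dataset directory containing JPEGImages/, Annotations/ and train_imgs.txt:

import torch
from torch.utils.data import DataLoader
from image_module.dataset_water import WaterDataset_RGB

# '/data/water_v1' and the 400x400 crop size are placeholders, not values from this commit.
dataset = WaterDataset_RGB(mode='train_offline',
                           dataset_path='/data/water_v1',
                           input_size=(400, 400))
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

for img_norm, label in loader:
    # img_norm: (B, 3, 400, 400) ImageNet-normalized image tensor
    # label:    (B, 1, 400, 400) float water mask
    pass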
image_module/transforms.py ADDED
@@ -0,0 +1,151 @@
+ import random
+ import math
+ import numpy as np
+ from PIL import Image, ImageFilter
+ from scipy.ndimage import binary_erosion, binary_dilation
+ import torchvision.transforms.functional as TF
+ from torchvision.transforms import RandomResizedCrop
+
+ random_thres = 0.8
+
+
+ def random_adjust_color(img, verbose=False):
+
+     if random.random() < random_thres:
+         brightness_factor = random.uniform(0.1, 1.2)
+         img = TF.adjust_brightness(img, brightness_factor)
+         if verbose:
+             print('Brightness:', brightness_factor)
+
+     if random.random() < random_thres:
+         contrast_factor = random.uniform(0.2, 1.8)
+         img = TF.adjust_contrast(img, contrast_factor)
+         if verbose:
+             print('Contrast:', contrast_factor)
+
+     if random.random() < random_thres:
+         # hue_factor = random.uniform(-0.1, 0.1)
+         hue_factor = 0.1
+         img = TF.adjust_hue(img, hue_factor)
+         if verbose:
+             print('Hue:', hue_factor)
+
+     return img
+
+
+ def random_affine_transformation(img, mask, label, verbose=False):
+
+     if random.random() < random_thres:
+         degrees = random.uniform(-20, 20)
+         translate_h = random.uniform(-0.2, 0.2)
+         translate_v = random.uniform(-0.2, 0.2)
+         scale = random.uniform(0.7, 1.3)
+         shear = random.uniform(-20, 20)
+         resample = TF.InterpolationMode.BICUBIC
+
+         img = TF.affine(img, degrees, (translate_h, translate_v), scale, shear, resample)
+         if mask:
+             mask = TF.affine(mask, degrees, (translate_h, translate_v), scale, shear, resample)
+         label = TF.affine(label, degrees, (translate_h, translate_v), scale, shear, resample)
+
+         if verbose:
+             print('Affine degrees: %.1f, T_h: %.1f, T_v: %.1f, Scale: %.1f, Shear: %.1f' %
+                   (degrees, translate_h, translate_v, scale, shear))
+
+     if random.random() < 0.5:
+
+         img = TF.hflip(img)
+         if mask:
+             mask = TF.hflip(mask)
+         label = TF.hflip(label)
+
+         if verbose:
+             print('Horizontal flip')
+
+     if mask:
+         return img, mask, label
+     else:
+         return img, label
+
+
+ def random_mask_perturbation(mask, verbose=False):
+
+     degrees = random.uniform(-10, 10)
+     translate_h = random.uniform(-0.1, 0.1)
+     translate_v = random.uniform(-0.1, 0.1)
+     scale = random.uniform(0.8, 1.2)
+     shear = random.uniform(-10, 10)
+     resample = TF.InterpolationMode.BICUBIC
+
+     mask = TF.affine(mask, degrees, (translate_h, translate_v), scale, shear, resample)
+
+     if verbose:
+         print('Mask perturbation degrees: %.1f, T_h: %.1f, T_v: %.1f, Scale: %.1f, Shear: %.1f' %
+               (degrees, translate_h, translate_v, scale, shear))
+
+     morphologic_times = int(random.random() * 10)
+     morphologic_thres = random.random()
+     filter_size = 7
+     for i in range(morphologic_times):
+         if random.random() < morphologic_thres:
+             mask = mask.filter(ImageFilter.MinFilter(filter_size))
+             if verbose:
+                 print(i, 'erosion')
+         else:
+             mask = mask.filter(ImageFilter.MaxFilter(filter_size))
+             if verbose:
+                 print(i, 'dilation')
+
+     mask = mask.convert('1')
+
+     return mask
+
+
+ def random_resized_crop(img, mask, label, size, verbose=False):
+
+     scale = (0.08, 1.0)
+     ratio = (0.75, 1.33333333)
+
+     sample_flag = False
+
+     for attempt in range(10):
+         area = img.size[0] * img.size[1]
+         target_area = random.uniform(*scale) * area
+         aspect_ratio = random.uniform(*ratio)
+
+         w = int(round(math.sqrt(target_area * aspect_ratio)))
+         h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+         if random.random() < 0.5:
+             w, h = h, w
+
+         if w <= img.size[0] and h <= img.size[1]:
+             y = random.randint(0, img.size[1] - h)
+             x = random.randint(0, img.size[0] - w)
+             sample_flag = True
+             break
+
+     # Fallback
+     if not sample_flag:
+         w = min(img.size[0], img.size[1])
+         y = (img.size[1] - w) // 2
+         x = (img.size[0] - w) // 2
+         h = w
+
+     img = TF.resized_crop(img, y, x, h, w, size, TF.InterpolationMode.BICUBIC)
+     if mask:
+         mask = TF.resized_crop(mask, y, x, h, w, size, TF.InterpolationMode.BICUBIC)
+     label = TF.resized_crop(label, y, x, h, w, size, TF.InterpolationMode.BICUBIC)
+
+     if verbose:
+         print('x: %d, y: %d, w: %d, h: %d' % (x, y, w, h))
+
+     if mask:
+         return img, mask, label
+     else:
+         return img, label
+
+
+ def imagenet_normalization(img_tensor):
+
+     mean = [0.485, 0.456, 0.406]
+     std = [0.229, 0.224, 0.225]
+     img_tensor = TF.normalize(img_tensor, mean, std)
+
+     return img_tensor
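These functions operate on paired PIL images and keep the image and its label geometrically aligned. A small illustrative sketch (file paths are placeholders, not from this commit; the repo root is assumed to be importable):

from PIL import Image
import torchvision.transforms.functional as TF
from image_module import transforms as my_tf

img = Image.open('frame.jpg').convert('RGB')        # placeholder input frame
label = Image.open('frame_mask.png').convert('P')   # placeholder palette mask

img = my_tf.random_adjust_color(img, verbose=True)
img, label = my_tf.random_affine_transformation(img, None, label, verbose=True)
img, label = my_tf.random_resized_crop(img, None, label, (400, 400), verbose=True)

# Same normalization the dataset applies before feeding the network.
img_tensor = my_tf.imagenet_normalization(TF.to_tensor(img))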
myutils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .data import *
+ from .system import *
myutils/data.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ import numpy as np
3
+ from PIL import Image
4
+ from scipy.ndimage import binary_dilation
5
+ import cv2
6
+
7
+ from numpy.linalg import norm
8
+
9
+ import torch
10
+ from torch.nn import functional as NF
11
+ from torchvision.transforms import functional as TF
12
+
13
+
14
+ color_palette = [0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0] + [100, 100, 100] * 252
15
+
16
+
17
+ def postprocessing_pred(pred: np.ndarray) -> np.ndarray:
18
+
19
+ label_cnt, labels = cv2.connectedComponentsWithAlgorithm(pred, 8, cv2.CV_32S, cv2.CCL_GRANA)
20
+ if label_cnt == 2:
21
+ if labels[0, 0] == pred[0, 0]:
22
+ pred = labels
23
+ else:
24
+ pred = 1 - labels
25
+ else:
26
+ max_cnt, max_label = 0, 0
27
+ for i in range(label_cnt):
28
+ mask = labels == i
29
+ if pred[mask][0] == 0:
30
+ continue
31
+ cnt = len(mask.nonzero()[0])
32
+ if cnt > max_cnt:
33
+ max_cnt = cnt
34
+ max_label = i
35
+ pred = labels == max_label
36
+
37
+ return pred.astype(np.uint8)
38
+
39
+
40
+ def calc_uncertainty(score):
41
+
42
+ # seg shape: bs, obj_n, h, w
43
+ score_top, _ = score.topk(k=2, dim=1)
44
+ uncertainty = score_top[:, 0] / (score_top[:, 1] + 1e-8) # bs, h, w
45
+ uncertainty = torch.exp(1 - uncertainty).unsqueeze(1) # bs, 1, h, w
46
+ return uncertainty
47
+
48
+
49
+ def save_seg_mask(pred, seg_path, palette=color_palette):
50
+
51
+ seg_img = Image.fromarray(pred)
52
+ seg_img.putpalette(palette)
53
+ seg_img.save(seg_path)
54
+
55
+
56
+ def add_overlay(img, mask, colors=color_palette, alpha=0.4, cscale=1):
57
+
58
+ ids = np.unique(mask)
59
+ img_overlay = img.copy()
60
+ ones_np = np.ones(img.shape) * (1 - alpha)
61
+
62
+ colors = np.reshape(colors, (-1, 3))
63
+ colors = np.atleast_2d(colors) * cscale
64
+
65
+ for i in ids[1:]:
66
+
67
+ canvas = img * alpha + ones_np * np.array(colors[i])[::-1]
68
+
69
+ binary_mask = mask == i
70
+ img_overlay[binary_mask] = canvas[binary_mask]
71
+
72
+ contour = binary_dilation(binary_mask) ^ binary_mask
73
+ img_overlay[contour, :] = 0
74
+
75
+ return img_overlay
76
+
77
+
78
+ def save_overlay(img, mask, overlay_path, colors=[255, 0, 0], alpha=0.4, cscale=1):
79
+
80
+ img = (img.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
81
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
82
+
83
+ img_overlay = add_overlay(img, mask, colors, alpha, cscale)
84
+ cv2.imwrite(overlay_path, img_overlay)
85
+
86
+
87
+ def load_image_in_PIL(path, mode='RGB'):
88
+ img = Image.open(path)
89
+ img.load() # Very important for loading large image
90
+ return img.convert(mode)
91
+
92
+
93
+ def normalize(x):
94
+ return x / norm(x, ord=2, axis=1, keepdims=True)
95
+
96
+
97
+ def dist(p0, p1, axis):
98
+ return norm(p0 - p1, ord=2, axis=axis)
99
+
100
+
101
+ def resize_img(img, out_size):
102
+ h, w = img.shape[:2]
103
+
104
+ if h > w:
105
+ w_new = int(out_size * w / h)
106
+ h_new = out_size
107
+ else:
108
+ h_new = int(out_size * h / w)
109
+ w_new = out_size
110
+
111
+ img = cv2.resize(img, (w_new, h_new))
112
+ return img
113
+
114
+
115
+ def unify_features(features):
116
+ output_size = features['f0'].shape[-2:]
117
+ feature_tuple = tuple()
118
+
119
+ for key, f in features.items():
120
+ if key != 'f0':
121
+ f = NF.interpolate(
122
+ f,
123
+ size=output_size, mode='bilinear', align_corners=False
124
+ )
125
+ feature_tuple += (f,)
126
+
127
+ unified_feature = torch.cat(feature_tuple, dim=1)
128
+
129
+ return unified_feature
130
+
131
+
132
+ def pad_divide_by(in_list, d, in_size):
133
+ out_list = []
134
+ h, w = in_size
135
+ if h % d > 0:
136
+ new_h = h + d - h % d
137
+ else:
138
+ new_h = h
139
+ if w % d > 0:
140
+ new_w = w + d - w % d
141
+ else:
142
+ new_w = w
143
+ lh, uh = int((new_h - h) / 2), int(new_h - h) - int((new_h - h) / 2)
144
+ lw, uw = int((new_w - w) / 2), int(new_w - w) - int((new_w - w) / 2)
145
+ pad_array = (int(lw), int(uw), int(lh), int(uh))
146
+ for inp in in_list:
147
+ out_list.append(NF.pad(inp, pad_array))
148
+
149
+ return out_list, pad_array
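pad_divide_by is the helper the video model uses to make frame sizes divisible by the encoder stride; the returned pad tuple lets the caller crop the prediction back to the original size. A small sketch with an arbitrary input size (importing myutils.data assumes OpenCV and SciPy are installed):

import torch
from myutils.data import pad_divide_by

frame = torch.rand(1, 3, 241, 427)                            # arbitrary H, W
(frame_p,), pad = pad_divide_by([frame], 16, frame.shape[-2:])
print(frame_p.shape)                                          # padded to multiples of 16: (1, 3, 256, 432)

# Crop the padding back off, mirroring what AFB_URR.segment does after inference.
restored = frame_p[:, :, pad[2]:frame_p.shape[-2] - pad[3], pad[0]:frame_p.shape[-1] - pad[1]]
assert restored.shape == frame.shape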
myutils/system.py ADDED
@@ -0,0 +1,103 @@
+ import time
+ import os
+ import shutil
+ import numpy as np
+ from PIL import Image
+
+ import torch
+
+
+ class AvgMeter(object):
+
+     def __init__(self, window=-1):
+         self.window = window
+         self.reset()
+
+     def reset(self):
+         self.avg = 0
+         self.sum = 0
+         self.cnt = 0
+         self.max = -np.inf
+
+         if self.window > 0:
+             self.val_arr = np.zeros(self.window)
+             self.arr_idx = 0
+
+     def update(self, val, n=1):
+
+         self.cnt += n
+         self.max = max(self.max, val)
+
+         if self.window > 0:
+             self.val_arr[self.arr_idx] = val
+             self.arr_idx = (self.arr_idx + 1) % self.window
+             self.avg = self.val_arr.mean()
+         else:
+             self.sum += val * n
+             self.avg = self.sum / self.cnt
+
+
+ class FrameSecondMeter(object):
+
+     def __init__(self):
+         self.st = time.time()
+         self.fps = None
+         self.ed = None
+         self.frame_n = 0
+
+     def add_frame_n(self, frame_n):
+         self.frame_n += frame_n
+
+     def end(self):
+         self.ed = time.time()
+         self.fps = self.frame_n / (self.ed - self.st)
+
+
+ def gct(f='l'):
+     '''
+     get current time
+     :param f: 'l' for log, 'f' for file name
+     :return: formatted time
+     '''
+     if f == 'l':
+         return time.strftime('%m/%d %H:%M:%S', time.localtime(time.time()))
+     elif f == 'f':
+         return time.strftime('%m_%d_%H_%M', time.localtime(time.time()))
+
+
+ def save_scripts(path, scripts_to_save=None):
+     if not os.path.exists(os.path.join(path, 'scripts')):
+         os.makedirs(os.path.join(path, 'scripts'))
+
+     if scripts_to_save is not None:
+         for script in scripts_to_save:
+             dst_path = os.path.join(path, 'scripts', script)
+             try:
+                 shutil.copy(script, dst_path)
+             except IOError:
+                 os.makedirs(os.path.dirname(dst_path))
+                 shutil.copy(script, dst_path)
+
+
+ def count_model_size(model):
+     return sum(np.prod(v.size()) for name, v in model.named_parameters()) / 1e6
+
+
+ def load_image_in_PIL(path, mode='RGB'):
+     img = Image.open(path)
+     img.load()  # Very important for loading large images
+     return img.convert(mode)
+
+
+ def print_mem(info=None):
+     if info:
+         print(info, end=' ')
+     mem_allocated = round(torch.cuda.memory_allocated() / 1048576)
+     mem_cached = round(torch.cuda.memory_cached() / 1048576)
+     print(f'Mem allocated: {mem_allocated}MB, Mem cached: {mem_cached}MB')
+
+
+ def set_bn_eval(m):
+     classname = m.__class__.__name__
+     if classname.find('BatchNorm') != -1:
+         m.eval()
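A short, hypothetical sketch of how these meters are typically driven from a training or evaluation loop (the loss values below are made up):

from myutils.system import AvgMeter, FrameSecondMeter, gct

loss_meter = AvgMeter(window=50)            # windowed running average over a 50-slot buffer
for step, loss in enumerate([0.9, 0.7, 0.6]):
    loss_meter.update(loss)
print(gct(), f'avg loss: {loss_meter.avg:.3f}')

fps_meter = FrameSecondMeter()
fps_meter.add_frame_n(300)                  # frames processed since construction
fps_meter.end()
print(f'{fps_meter.fps:.1f} FPS')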
records/link_efficientb4_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab073b386a30375e0947246dbfcbab5b23197056d8918841f6b7e2764add7440
+ size 72294975
video_module/__init__.py ADDED
File without changes
video_module/dataset/Water_DS.py ADDED
@@ -0,0 +1,95 @@
+ import os
+ import numpy as np
+
+ from glob import glob
+
+ import torch
+ from torch.utils import data
+ import torchvision.transforms as TF
+
+ from video_module.dataset import transforms as mytrans
+ import myutils
+
+
+ class Water_Image_Train_DS(data.Dataset):
+
+     def __init__(self, root, output_size, clip_n, max_obj_n):
+         self.root = root
+         self.clip_n = clip_n
+         self.output_size = output_size
+         self.max_obj_n = max_obj_n
+
+         self.img_list = sorted(glob(os.path.join(root, 'JPEGImages', '*.jpg')) + glob(os.path.join(root, 'JPEGImages', '*.png')))
+         self.mask_list = sorted(glob(os.path.join(root, 'Annotations', '*.png')))
+
+         assert len(self.img_list) == len(self.mask_list), "The number of images and masks should be the same"
+
+         self.random_horizontal_flip = mytrans.RandomHorizontalFlip(0.3)
+         self.color_jitter = TF.ColorJitter(0.1, 0.1, 0.1, 0.03)
+         self.random_affine = mytrans.RandomAffine(degrees=20, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10)
+         self.random_resize_crop = mytrans.RandomResizedCrop(output_size, (0.8, 1))
+         self.to_tensor = TF.ToTensor()
+         self.to_onehot = mytrans.ToOnehot(max_obj_n, shuffle=True)
+
+     def __len__(self):
+         return len(self.img_list)
+
+     def __getitem__(self, idx):
+
+         img_pil = myutils.load_image_in_PIL(self.img_list[idx], 'RGB')
+         mask_pil = myutils.load_image_in_PIL(self.mask_list[idx], 'P')
+
+         frames = torch.zeros((self.clip_n, 3, self.output_size, self.output_size), dtype=torch.float)
+         masks = torch.zeros((self.clip_n, self.max_obj_n, self.output_size, self.output_size), dtype=torch.float)
+
+         for i in range(self.clip_n):
+             img, mask = img_pil, mask_pil
+             if i > 0:
+                 img, mask = self.random_horizontal_flip(img, mask)
+                 img = self.color_jitter(img)
+                 img, mask = self.random_affine(img, mask)
+
+             img, mask = self.random_resize_crop(img, mask)
+             mask = np.array(mask, np.uint8)
+
+             if i == 0:
+                 mask, obj_list = self.to_onehot(mask)
+                 obj_n = len(obj_list) + 1
+             else:
+                 mask, _ = self.to_onehot(mask, obj_list)
+
+             frames[i] = self.to_tensor(img)
+             masks[i] = mask
+
+         info = {
+             'name': self.img_list[idx]
+         }
+         return frames, masks[:, :obj_n], obj_n, info
+
+
+ class Video_DS(data.Dataset):
+
+     def __init__(self, img_list, first_frame, first_mask):
+         self.img_list = img_list[1:]
+         self.video_len = len(self.img_list)
+
+         first_mask = np.array(first_mask, np.uint8) > 0
+         self.obj_n = first_mask.max() + 1
+
+         self.to_tensor = TF.ToTensor()
+         self.to_onehot = mytrans.ToOnehot(self.obj_n, shuffle=False)
+
+         mask, _ = self.to_onehot(first_mask)
+         self.first_mask = mask[:self.obj_n]
+         self.first_frame = self.to_tensor(first_frame)
+
+     def __len__(self):
+         return self.video_len
+
+     def __getitem__(self, idx):
+         img = myutils.load_image_in_PIL(self.img_list[idx], 'RGB')
+         frame = self.to_tensor(img)
+         img_name = os.path.basename(self.img_list[idx])[:-4]
+
+         return frame, img_name
video_module/dataset/__init__.py ADDED
@@ -0,0 +1 @@
+ from .Water_DS import *
video_module/dataset/transforms.py ADDED
@@ -0,0 +1,468 @@
1
+ import math
2
+ import warnings
3
+ import random
4
+ import numbers
5
+ import numpy as np
6
+ from PIL import Image
7
+ from collections.abc import Sequence
8
+
9
+ import torch
10
+ import torchvision.transforms.functional as TF
11
+
12
+ _pil_interpolation_to_str = {
13
+ Image.NEAREST: 'PIL.Image.NEAREST',
14
+ Image.BILINEAR: 'PIL.Image.BILINEAR',
15
+ Image.BICUBIC: 'PIL.Image.BICUBIC',
16
+ Image.LANCZOS: 'PIL.Image.LANCZOS',
17
+ Image.HAMMING: 'PIL.Image.HAMMING',
18
+ Image.BOX: 'PIL.Image.BOX',
19
+ }
20
+
21
+
22
+ def _get_image_size(img):
23
+ if TF._is_pil_image(img):
24
+ return img.size
25
+ elif isinstance(img, torch.Tensor) and img.dim() > 2:
26
+ return img.shape[-2:][::-1]
27
+ else:
28
+ raise TypeError("Unexpected type {}".format(type(img)))
29
+
30
+
31
+ class RandomHorizontalFlip(object):
32
+ """Horizontal flip the given PIL Image randomly with a given probability.
33
+
34
+ Args:
35
+ p (float): probability of the image being flipped. Default value is 0.5
36
+ """
37
+
38
+ def __init__(self, p=0.5):
39
+ self.p = p
40
+
41
+ def __call__(self, img, mask):
42
+ """
43
+ Args:
44
+ img (PIL Image): Image to be flipped.
45
+
46
+ Returns:
47
+ PIL Image: Randomly flipped image.
48
+ """
49
+ if random.random() < self.p:
50
+ img = TF.hflip(img)
51
+ mask = TF.hflip(mask)
52
+ return img, mask
53
+
54
+ def __repr__(self):
55
+ return self.__class__.__name__ + '(p={})'.format(self.p)
56
+
57
+
58
+ class RandomAffine(object):
59
+ """Random affine transformation of the image keeping center invariant
60
+
61
+ Args:
62
+ degrees (sequence or float or int): Range of degrees to select from.
63
+ If degrees is a number instead of sequence like (min, max), the range of degrees
64
+ will be (-degrees, +degrees). Set to 0 to deactivate rotations.
65
+ translate (tuple, optional): tuple of maximum absolute fraction for horizontal
66
+ and vertical translations. For example translate=(a, b), then horizontal shift
67
+ is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
68
+ randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
69
+ scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
70
+ randomly sampled from the range a <= scale <= b. Will keep original scale by default.
71
+ shear (sequence or float or int, optional): Range of degrees to select from.
72
+ If shear is a number, a shear parallel to the x axis in the range (-shear, +shear)
73
+ will be applied. Else if shear is a tuple or list of 2 values a shear parallel to the x axis in the
74
+ range (shear[0], shear[1]) will be applied. Else if shear is a tuple or list of 4 values,
75
+ a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
76
+ Will not apply shear by default
77
+ resample ({PIL.Image.NEAREST, PIL.Image.BILINEAR, PIL.Image.BICUBIC}, optional):
78
+ An optional resampling filter. See `filters`_ for more information.
79
+ If omitted, or if the image has mode "1" or "P", it is set to PIL.Image.NEAREST.
80
+ fillcolor (tuple or int): Optional fill color (Tuple for RGB Image And int for grayscale) for the area
81
+ outside the transform in the output image.(Pillow>=5.0.0)
82
+
83
+ .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
84
+
85
+ """
86
+
87
+ def __init__(self, degrees, translate=None, scale=None, shear=None, resample=False, fillcolor=0):
88
+ if isinstance(degrees, numbers.Number):
89
+ if degrees < 0:
90
+ raise ValueError("If degrees is a single number, it must be positive.")
91
+ self.degrees = (-degrees, degrees)
92
+ else:
93
+ assert isinstance(degrees, (tuple, list)) and len(degrees) == 2, \
94
+ "degrees should be a list or tuple and it must be of length 2."
95
+ self.degrees = degrees
96
+
97
+ if translate is not None:
98
+ assert isinstance(translate, (tuple, list)) and len(translate) == 2, \
99
+ "translate should be a list or tuple and it must be of length 2."
100
+ for t in translate:
101
+ if not (0.0 <= t <= 1.0):
102
+ raise ValueError("translation values should be between 0 and 1")
103
+ self.translate = translate
104
+
105
+ if scale is not None:
106
+ assert isinstance(scale, (tuple, list)) and len(scale) == 2, \
107
+ "scale should be a list or tuple and it must be of length 2."
108
+ for s in scale:
109
+ if s <= 0:
110
+ raise ValueError("scale values should be positive")
111
+ self.scale = scale
112
+
113
+ if shear is not None:
114
+ if isinstance(shear, numbers.Number):
115
+ if shear < 0:
116
+ raise ValueError("If shear is a single number, it must be positive.")
117
+ self.shear = (-shear, shear)
118
+ else:
119
+ assert isinstance(shear, (tuple, list)) and \
120
+ (len(shear) == 2 or len(shear) == 4), \
121
+ "shear should be a list or tuple and it must be of length 2 or 4."
122
+ # X-Axis shear with [min, max]
123
+ if len(shear) == 2:
124
+ self.shear = [shear[0], shear[1], 0., 0.]
125
+ elif len(shear) == 4:
126
+ self.shear = [s for s in shear]
127
+ else:
128
+ self.shear = shear
129
+
130
+ self.resample = resample
131
+ self.fillcolor = fillcolor
132
+
133
+ @staticmethod
134
+ def get_params(degrees, translate, scale_ranges, shears, img_size):
135
+ """Get parameters for affine transformation
136
+
137
+ Returns:
138
+ sequence: params to be passed to the affine transformation
139
+ """
140
+ angle = random.uniform(degrees[0], degrees[1])
141
+ if translate is not None:
142
+ max_dx = translate[0] * img_size[0]
143
+ max_dy = translate[1] * img_size[1]
144
+ translations = (np.round(random.uniform(-max_dx, max_dx)),
145
+ np.round(random.uniform(-max_dy, max_dy)))
146
+ else:
147
+ translations = (0, 0)
148
+
149
+ if scale_ranges is not None:
150
+ scale = random.uniform(scale_ranges[0], scale_ranges[1])
151
+ else:
152
+ scale = 1.0
153
+
154
+ if shears is not None:
155
+ if len(shears) == 2:
156
+ shear = [random.uniform(shears[0], shears[1]), 0.]
157
+ elif len(shears) == 4:
158
+ shear = [random.uniform(shears[0], shears[1]),
159
+ random.uniform(shears[2], shears[3])]
160
+ else:
161
+ shear = 0.0
162
+
163
+ return angle, translations, scale, shear
164
+
165
+ def __call__(self, img, mask):
166
+ """
167
+ img (PIL Image): Image to be transformed.
168
+
169
+ Returns:
170
+ PIL Image: Affine transformed image.
171
+ """
172
+ ret = self.get_params(self.degrees, self.translate, self.scale, self.shear, img.size)
173
+ img = TF.affine(img, *ret, interpolation=TF.InterpolationMode.BICUBIC, fill=self.fillcolor)
174
+ mask = TF.affine(mask, *ret, interpolation=TF.InterpolationMode.NEAREST, fill=self.fillcolor)
175
+ return img, mask
176
+
177
+ def __repr__(self):
178
+ s = '{name}(degrees={degrees}'
179
+ if self.translate is not None:
180
+ s += ', translate={translate}'
181
+ if self.scale is not None:
182
+ s += ', scale={scale}'
183
+ if self.shear is not None:
184
+ s += ', shear={shear}'
185
+ if self.resample > 0:
186
+ s += ', resample={resample}'
187
+ if self.fillcolor != 0:
188
+ s += ', fillcolor={fillcolor}'
189
+ s += ')'
190
+ d = dict(self.__dict__)
191
+ d['resample'] = _pil_interpolation_to_str[d['resample']]
192
+ return s.format(name=self.__class__.__name__, **d)
193
+
194
+
195
+ class RandomCrop(object):
196
+ """Crop the given PIL Image at a random location.
197
+
198
+ Args:
199
+ size (sequence or int): Desired output size of the crop. If size is an
200
+ int instead of sequence like (h, w), a square crop (size, size) is
201
+ made.
202
+ padding (int or sequence, optional): Optional padding on each border
203
+ of the image. Default is None, i.e no padding. If a sequence of length
204
+ 4 is provided, it is used to pad left, top, right, bottom borders
205
+ respectively. If a sequence of length 2 is provided, it is used to
206
+ pad left/right, top/bottom borders, respectively.
207
+ pad_if_needed (boolean): It will pad the image if smaller than the
208
+ desired size to avoid raising an exception. Since cropping is done
209
+ after padding, the padding seems to be done at a random offset.
210
+ fill: Pixel fill value for constant fill. Default is 0. If a tuple of
211
+ length 3, it is used to fill R, G, B channels respectively.
212
+ This value is only used when the padding_mode is constant
213
+ padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant.
214
+
215
+ - constant: pads with a constant value, this value is specified with fill
216
+
217
+ - edge: pads with the last value on the edge of the image
218
+
219
+ - reflect: pads with reflection of image (without repeating the last value on the edge)
220
+
221
+ padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
222
+ will result in [3, 2, 1, 2, 3, 4, 3, 2]
223
+
224
+ - symmetric: pads with reflection of image (repeating the last value on the edge)
225
+
226
+ padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
227
+ will result in [2, 1, 1, 2, 3, 4, 4, 3]
228
+
229
+ """
230
+
231
+ def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'):
232
+ if isinstance(size, numbers.Number):
233
+ self.size = (int(size), int(size))
234
+ else:
235
+ self.size = size
236
+ self.padding = padding
237
+ self.pad_if_needed = pad_if_needed
238
+ self.fill = fill
239
+ self.padding_mode = padding_mode
240
+
241
+ @staticmethod
242
+ def get_params(img, output_size):
243
+ """Get parameters for ``crop`` for a random crop.
244
+
245
+ Args:
246
+ img (PIL Image): Image to be cropped.
247
+ output_size (tuple): Expected output size of the crop.
248
+
249
+ Returns:
250
+ tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
251
+ """
252
+ w, h = _get_image_size(img)
253
+ th, tw = output_size
254
+ if w == tw and h == th:
255
+ return 0, 0, h, w
256
+
257
+ i = random.randint(0, h - th)
258
+ j = random.randint(0, w - tw)
259
+ return i, j, th, tw
260
+
261
+ def __call__(self, img, mask):
262
+ """
263
+ Args:
264
+ img (PIL Image): Image to be cropped.
265
+
266
+ Returns:
267
+ PIL Image: Cropped image.
268
+ """
269
+ # if self.padding is not None:
270
+ # img = TF.pad(img, self.padding, self.fill, self.padding_mode)
271
+ #
272
+ # # pad the width if needed
273
+ # if self.pad_if_needed and img.size[0] < self.size[1]:
274
+ # img = TF.pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode)
275
+ # # pad the height if needed
276
+ # if self.pad_if_needed and img.size[1] < self.size[0]:
277
+ # img = TF.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode)
278
+
279
+ i, j, h, w = self.get_params(img, self.size)
280
+ img = TF.crop(img, i, j, h, w)
281
+ mask = TF.crop(mask, i, j, h, w)
282
+
283
+ return img, mask
284
+
285
+ def __repr__(self):
286
+ return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding)
287
+
288
+
289
+ class RandomResizedCrop(object):
290
+ """Crop the given PIL Image to random size and aspect ratio.
291
+
292
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random
293
+ aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
294
+ is finally resized to given size.
295
+ This is popularly used to train the Inception networks.
296
+
297
+ Args:
298
+ size: expected output size of each edge
299
+ scale: range of size of the origin size cropped
300
+ ratio: range of aspect ratio of the origin aspect ratio cropped
301
+ interpolation: Default: PIL.Image.BILINEAR
302
+ """
303
+
304
+ def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), interpolation=Image.BILINEAR):
305
+ if isinstance(size, (tuple, list)):
306
+ self.size = size
307
+ else:
308
+ self.size = (size, size)
309
+ if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
310
+ warnings.warn("range should be of kind (min, max)")
311
+
312
+ self.interpolation = interpolation
313
+ self.scale = scale
314
+ self.ratio = ratio
315
+
316
+ @staticmethod
317
+ def get_params(img, scale, ratio):
318
+ """Get parameters for ``crop`` for a random sized crop.
319
+
320
+ Args:
321
+ img (PIL Image): Image to be cropped.
322
+ scale (tuple): range of size of the origin size cropped
323
+ ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
324
+
325
+ Returns:
326
+ tuple: params (i, j, h, w) to be passed to ``crop`` for a random
327
+ sized crop.
328
+ """
329
+ width, height = _get_image_size(img)
330
+ area = height * width
331
+
332
+ for _ in range(10):
333
+ target_area = random.uniform(*scale) * area
334
+ log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
335
+ aspect_ratio = math.exp(random.uniform(*log_ratio))
336
+
337
+ w = int(round(math.sqrt(target_area * aspect_ratio)))
338
+ h = int(round(math.sqrt(target_area / aspect_ratio)))
339
+
340
+ if 0 < w <= width and 0 < h <= height:
341
+ i = random.randint(0, height - h)
342
+ j = random.randint(0, width - w)
343
+ return i, j, h, w
344
+
345
+ # Fallback to central crop
346
+ in_ratio = float(width) / float(height)
347
+ if (in_ratio < min(ratio)):
348
+ w = width
349
+ h = int(round(w / min(ratio)))
350
+ elif (in_ratio > max(ratio)):
351
+ h = height
352
+ w = int(round(h * max(ratio)))
353
+ else: # whole image
354
+ w = width
355
+ h = height
356
+ i = (height - h) // 2
357
+ j = (width - w) // 2
358
+ return i, j, h, w
359
+
360
+ def __call__(self, img, mask):
361
+ """
362
+ Args:
363
+ img (PIL Image): Image to be cropped and resized.
364
+
365
+ Returns:
366
+ PIL Image: Randomly cropped and resized image.
367
+ """
368
+ i, j, h, w = self.get_params(img, self.scale, self.ratio)
369
+ # print(i, j, h, w)
370
+ img = TF.resized_crop(img, i, j, h, w, self.size, TF.InterpolationMode.BICUBIC)
371
+ mask = TF.resized_crop(mask, i, j, h, w, self.size, TF.InterpolationMode.NEAREST)
372
+ return img, mask
373
+
374
+ def __repr__(self):
375
+ interpolate_str = _pil_interpolation_to_str[self.interpolation]
376
+ format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
377
+ format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
378
+ format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
379
+ format_string += ', interpolation={0})'.format(interpolate_str)
380
+ return format_string
381
+
382
+
383
+ class ToOnehot(object):
384
+ """To onehot tensor
385
+
386
+ Args:
387
+ max_obj_n (float): Maximum number of the objects
388
+ """
389
+
390
+ def __init__(self, max_obj_n, shuffle):
391
+ self.max_obj_n = max_obj_n
392
+ self.shuffle = shuffle
393
+
394
+ def __call__(self, mask, obj_list=None):
395
+ """
396
+ Args:
397
+ mask (Mask in Numpy): Mask to be converted.
398
+
399
+ Returns:
400
+ Tensor: Converted mask in onehot format.
401
+ """
402
+
403
+ new_mask = np.zeros((self.max_obj_n, *mask.shape), np.uint8)
404
+
405
+ if not obj_list:
406
+ obj_list = list()
407
+ obj_max = mask.max() + 1
408
+ for i in range(1, obj_max):
409
+ tmp = (mask == i).astype(np.uint8)
410
+ if tmp.max() > 0:
411
+ obj_list.append(i)
412
+
413
+ if self.shuffle:
414
+ random.shuffle(obj_list)
415
+ obj_list = obj_list[:self.max_obj_n - 1]
416
+
417
+ for i in range(len(obj_list)):
418
+ new_mask[i + 1] = (mask == obj_list[i]).astype(np.uint8)
419
+ new_mask[0] = 1 - np.sum(new_mask, axis=0)
420
+
421
+ return torch.from_numpy(new_mask), obj_list
422
+
423
+ def __repr__(self):
424
+ return self.__class__.__name__ + '(max_obj_n={})'.format(self.max_obj_n)
425
+
426
+
427
+ class Resize(torch.nn.Module):
428
+ """Resize the input image to the given size.
429
+ The image can be a PIL Image or a torch Tensor, in which case it is expected
430
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
431
+
432
+ Args:
433
+ size (sequence or int): Desired output size. If size is a sequence like
434
+ (h, w), output size will be matched to this. If size is an int,
435
+ smaller edge of the image will be matched to this number.
436
+ i.e, if height > width, then image will be rescaled to
437
+ (size * height / width, size).
438
+ In torchscript mode padding as single int is not supported, use a tuple or
439
+ list of length 1: ``[size, ]``.
440
+ interpolation (int, optional): Desired interpolation enum defined by `filters`_.
441
+ Default is ``PIL.Image.BILINEAR``. If input is Tensor, only ``PIL.Image.NEAREST``, ``PIL.Image.BILINEAR``
442
+ and ``PIL.Image.BICUBIC`` are supported.
443
+ """
444
+
445
+ def __init__(self, size, interpolation=Image.BILINEAR):
446
+ super().__init__()
447
+ if not isinstance(size, (int, Sequence)):
448
+ raise TypeError("Size should be int or sequence. Got {}".format(type(size)))
449
+ if isinstance(size, Sequence) and len(size) not in (1, 2):
450
+ raise ValueError("If size is a sequence, it should have 1 or 2 values")
451
+ self.size = size
452
+ self.interpolation = interpolation
453
+
454
+ def forward(self, img, mask):
455
+ """
456
+ Args:
457
+ img (PIL Image or Tensor): Image to be scaled.
458
+
459
+ Returns:
460
+ PIL Image or Tensor: Rescaled image.
461
+ """
462
+ img = TF.resize(img, self.size, self.interpolation)
463
+ mask = TF.resize(mask, self.size, Image.NEAREST)
464
+ return img, mask
465
+
466
+ def __repr__(self):
467
+ interpolate_str = _pil_interpolation_to_str[self.interpolation]
468
+ return self.__class__.__name__ + '(size={0}, interpolation={1})'.format(self.size, interpolate_str)
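ToOnehot is the transform that turns an integer/palette mask into per-object channels for the video dataset. A toy example with an invented two-object mask:

import numpy as np
from video_module.dataset.transforms import ToOnehot

mask = np.zeros((4, 4), np.uint8)
mask[:2, :2] = 1                       # object 1
mask[2:, 2:] = 2                       # object 2

to_onehot = ToOnehot(max_obj_n=4, shuffle=False)
onehot, obj_list = to_onehot(mask)
print(onehot.shape, obj_list)          # torch.Size([4, 4, 4]), [1, 2]; channel 0 is background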
video_module/model/AFB_URR.py ADDED
@@ -0,0 +1,319 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from torchvision.models import resnet50, ResNet50_Weights
6
+
7
+ import myutils
8
+
9
+
10
+ class ResBlock(nn.Module):
11
+ """A simple residual block component."""
12
+ def __init__(self, indim, outdim=None, stride=1):
13
+ super(ResBlock, self).__init__()
14
+ outdim = outdim or indim
15
+ self.conv1 = nn.Conv2d(indim, outdim, kernel_size=3, padding=1, stride=stride)
16
+ self.conv2 = nn.Conv2d(outdim, outdim, kernel_size=3, padding=1)
17
+ self.downsample = nn.Conv2d(indim, outdim, kernel_size=1, stride=stride) if indim != outdim or stride != 1 else None
18
+
19
+ def forward(self, x):
20
+ identity = x
21
+ out = F.relu(self.conv1(x))
22
+ out = self.conv2(out)
23
+ if self.downsample:
24
+ identity = self.downsample(identity)
25
+ out += identity
26
+ return F.relu(out)
27
+
28
+
29
+ class EncoderM(nn.Module):
30
+ def __init__(self, load_imagenet_params):
31
+ super(EncoderM, self).__init__()
32
+ self.conv1_m = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
33
+ self.conv1_o = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
34
+
35
+ weights = ResNet50_Weights.IMAGENET1K_V1 if load_imagenet_params else None
36
+ resnet = resnet50(weights=weights)
37
+ self.conv1 = resnet.conv1
38
+ self.bn1 = resnet.bn1
39
+ self.relu = resnet.relu # 1/2, 64
40
+ self.maxpool = resnet.maxpool
41
+
42
+ self.res2 = resnet.layer1 # 1/4, 256
43
+ self.res3 = resnet.layer2 # 1/8, 512
44
+ self.res4 = resnet.layer3 # 1/16, 1024
45
+
46
+ self.register_buffer('mean', torch.FloatTensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
47
+ self.register_buffer('std', torch.FloatTensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
48
+
49
+ def forward(self, in_f, in_m, in_o):
50
+ f = (in_f - self.mean) / self.std
51
+
52
+ x = self.conv1(f) + self.conv1_m(in_m) + self.conv1_o(in_o)
53
+ x = self.bn1(x)
54
+ r1 = self.relu(x) # 1/2, 64
55
+ x = self.maxpool(r1) # 1/4, 64
56
+ r2 = self.res2(x) # 1/4, 256
57
+ r3 = self.res3(r2) # 1/8, 512
58
+ r4 = self.res4(r3) # 1/16, 1024
59
+
60
+ return r4, r1
61
+
62
+
63
+ class EncoderQ(nn.Module):
64
+ def __init__(self, load_imagenet_params):
65
+ super(EncoderQ, self).__init__()
66
+ weights = ResNet50_Weights.IMAGENET1K_V1 if load_imagenet_params else None
67
+ resnet = resnet50(weights=weights)
68
+ self.conv1 = resnet.conv1
69
+ self.bn1 = resnet.bn1
70
+ self.relu = resnet.relu # 1/2, 64
71
+ self.maxpool = resnet.maxpool
72
+
73
+ self.res2 = resnet.layer1 # 1/4, 256
74
+ self.res3 = resnet.layer2 # 1/8, 512
75
+ self.res4 = resnet.layer3 # 1/8, 1024
76
+
77
+ self.register_buffer('mean', torch.FloatTensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
78
+ self.register_buffer('std', torch.FloatTensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
79
+
80
+ def forward(self, in_f):
81
+ f = (in_f - self.mean) / self.std
82
+
83
+ x = self.conv1(f)
84
+ x = self.bn1(x)
85
+ r1 = self.relu(x) # 1/2, 64
86
+ x = self.maxpool(r1) # 1/4, 64
87
+ r2 = self.res2(x) # 1/4, 256
88
+ r3 = self.res3(r2) # 1/8, 512
89
+ r4 = self.res4(r3) # 1/8, 1024
90
+
91
+ return r4, r3, r2, r1
92
+
93
+
94
+ class KeyValue(nn.Module):
95
+
96
+ def __init__(self, indim, keydim, valdim):
97
+ super(KeyValue, self).__init__()
98
+ self.keydim = keydim
99
+ self.valdim = valdim
100
+ self.Key = nn.Conv2d(indim, keydim, kernel_size=(3, 3), padding=(1, 1), stride=1)
101
+ self.Value = nn.Conv2d(indim, valdim, kernel_size=(3, 3), padding=(1, 1), stride=1)
102
+
103
+ def forward(self, x):
104
+ key = self.Key(x)
105
+ key = key.view(*key.shape[:2], -1) # obj_n, key_dim, pixel_n
106
+
107
+ val = self.Value(x)
108
+ val = val.view(*val.shape[:2], -1) # obj_n, key_dim, pixel_n
109
+ return key, val
110
+
111
+
112
+ class Refine(nn.Module):
113
+ def __init__(self, inplanes, planes):
114
+ super(Refine, self).__init__()
115
+ self.convFS = nn.Conv2d(inplanes, planes, kernel_size=(3, 3), padding=(1, 1), stride=1)
116
+ self.ResFS = ResBlock(planes, planes)
117
+ self.ResMM = ResBlock(planes, planes)
118
+ self.scale_factor = 2
119
+
120
+ def forward(self, f, pm):
121
+ s = self.ResFS(self.convFS(f))
122
+ m = s + F.interpolate(pm, scale_factor=self.scale_factor, mode='bilinear', align_corners=False)
123
+ m = self.ResMM(m)
124
+
125
+ return m
126
+
127
+
128
+ class Matcher(nn.Module):
129
+ def __init__(self, thres_valid=1e-3, update_bank=False):
130
+ super(Matcher, self).__init__()
131
+ self.thres_valid = thres_valid
132
+ self.update_bank = update_bank
133
+
134
+ def forward(self, feature_bank, q_in, q_out):
135
+
136
+ mem_out_list = []
137
+
138
+ for i in range(0, feature_bank.obj_n):
139
+ d_key, bank_n = feature_bank.keys[i].size()
140
+
141
+ try:
142
+ p = torch.matmul(feature_bank.keys[i].transpose(0, 1), q_in) / math.sqrt(d_key) # THW, HW
143
+ p = F.softmax(p, dim=1) # bs, bank_n, HW
144
+ mem = torch.matmul(feature_bank.values[i], p) # bs, D_o, HW
145
+ except RuntimeError as e:
146
+ device = feature_bank.keys[i].device
147
+ key_cpu = feature_bank.keys[i].cpu()
148
+ value_cpu = feature_bank.values[i].cpu()
149
+ q_in_cpu = q_in.cpu()
150
+
151
+ p = torch.matmul(key_cpu.transpose(0, 1), q_in_cpu) / math.sqrt(d_key) # THW, HW
152
+ p = F.softmax(p, dim=1) # bs, bank_n, HW
153
+ mem = torch.matmul(value_cpu, p).to(device) # bs, D_o, HW
154
+ p = p.to(device)
155
+ print('\tLine 158. GPU out of memory, use CPU', f'p size: {p.shape}')
156
+
157
+ mem_out_list.append(torch.cat([mem, q_out], dim=1))
158
+
159
+ if self.update_bank:
160
+ try:
161
+ ones = torch.ones_like(p)
162
+ zeros = torch.zeros_like(p)
163
+ bank_cnt = torch.where(p > self.thres_valid, ones, zeros).sum(dim=2)[0]
164
+ except RuntimeError as e:
165
+ device = p.device
166
+ p = p.cpu()
167
+ ones = torch.ones_like(p)
168
+ zeros = torch.zeros_like(p)
169
+ bank_cnt = torch.where(p > self.thres_valid, ones, zeros).sum(dim=2)[0].to(device)
170
+ print('\tLine 170. GPU out of memory, use CPU', f'p size: {p.shape}')
171
+
172
+ feature_bank.info[i][:, 1] += torch.log(bank_cnt + 1)
173
+
174
+ mem_out_tensor = torch.stack(mem_out_list, dim=0).transpose(0, 1) # bs, obj_n, dim, pixel_n
175
+
176
+ return mem_out_tensor
177
+
178
+
179
+ class Decoder(nn.Module):
180
+ def __init__(self, device): # mdim_global = 256
181
+ super(Decoder, self).__init__()
182
+
183
+ self.device = device
184
+ mdim_global = 256
185
+ mdim_local = 32
186
+ local_size = 7
187
+
188
+ # Patch-wise
189
+ self.convFM = nn.Conv2d(1024, mdim_global, kernel_size=3, padding=1, stride=1)
190
+ self.ResMM = ResBlock(mdim_global, mdim_global)
191
+ self.RF3 = Refine(512, mdim_global) # 1/8 -> 1/8
192
+ self.RF2 = Refine(256, mdim_global) # 1/8 -> 1/4
193
+ self.pred2 = nn.Conv2d(mdim_global, 2, kernel_size=3, padding=1, stride=1)
194
+
195
+ # Local
196
+ self.local_avg = nn.AvgPool2d(local_size, stride=1, padding=local_size // 2)
197
+ self.local_max = nn.MaxPool2d(local_size, stride=1, padding=local_size // 2)
198
+ self.local_convFM = nn.Conv2d(128, mdim_local, kernel_size=3, padding=1, stride=1)
199
+ self.local_ResMM = ResBlock(mdim_local, mdim_local)
200
+ self.local_pred2 = nn.Conv2d(mdim_local, 2, kernel_size=3, padding=1, stride=1)
201
+
202
+ for m in self.modules():
203
+ if isinstance(m, nn.Conv2d):
204
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
205
+
206
+ def forward(self, patch_match, r3, r2, r1=None, feature_shape=None):
207
+ p = self.ResMM(self.convFM(patch_match))
208
+ p = self.RF3(r3, p) # out: 1/8, 256
209
+ p = self.RF2(r2, p) # out: 1/4, 256
210
+ p = self.pred2(F.relu(p))
211
+
212
+ p = F.interpolate(p, scale_factor=2, mode='bilinear', align_corners=False)
213
+
214
+ bs, obj_n, h, w = feature_shape
215
+ rough_seg = F.softmax(p, dim=1)[:, 1]
216
+ rough_seg = rough_seg.view(bs, obj_n, h, w)
217
+ rough_seg = F.softmax(rough_seg, dim=1) # object-level normalization
218
+
219
+ # Local refinement
220
+ uncertainty = myutils.calc_uncertainty(rough_seg)
221
+ uncertainty = uncertainty.expand(-1, obj_n, -1, -1).reshape(bs * obj_n, 1, h, w)
222
+
223
+ rough_seg = rough_seg.view(bs * obj_n, 1, h, w) # bs*obj_n, 1, h, w
224
+ r1_weighted = r1 * rough_seg
225
+ r1_local = self.local_avg(r1_weighted) # bs*obj_n, 64, h, w
226
+ r1_local = r1_local / (self.local_avg(rough_seg) + 1e-8) # neighborhood reference
227
+ r1_conf = self.local_max(rough_seg) # bs*obj_n, 1, h, w
228
+
229
+ local_match = torch.cat([r1, r1_local], dim=1)
230
+ q = self.local_ResMM(self.local_convFM(local_match))
231
+ q = r1_conf * self.local_pred2(F.relu(q))
232
+
233
+ p = p + uncertainty * q
234
+ p = F.interpolate(p, scale_factor=2, mode='bilinear', align_corners=False)
235
+ p = F.softmax(p, dim=1)[:, 1] # no, h, w
236
+
237
+ return p
238
+
239
+
240
+ class AFB_URR(nn.Module):
241
+ def __init__(self, device, update_bank, load_imagenet_params=False):
242
+ super(AFB_URR, self).__init__()
243
+
244
+ self.device = device
245
+ self.encoder_m = EncoderM(load_imagenet_params)
246
+ self.encoder_q = EncoderQ(load_imagenet_params)
247
+
248
+ self.keyval_r4 = KeyValue(1024, keydim=128, valdim=512)
249
+
250
+ self.global_matcher = Matcher(update_bank=update_bank)
251
+ self.decoder = Decoder(device)
252
+
253
+ def memorize(self, frame, mask):
254
+
255
+ _, K, H, W = mask.shape
256
+
257
+ (frame, mask), pad = myutils.pad_divide_by([frame, mask], 16, (frame.size()[2], frame.size()[3]))
258
+
259
+ frame = frame.expand(K, -1, -1, -1) # obj_n, 3, h, w
260
+ mask = mask[0].unsqueeze(1).float()
261
+ mask_ones = torch.ones_like(mask)
262
+ mask_inv = (mask_ones - mask).clamp(0, 1)
263
+
264
+ r4, r1 = self.encoder_m(frame, mask, mask_inv)
265
+
266
+ k4, v4 = self.keyval_r4(r4) # num_objects, 128 and 512, H/16, W/16
267
+ k4_list = [k4[i] for i in range(K)]
268
+ v4_list = [v4[i] for i in range(K)]
269
+
270
+ return k4_list, v4_list
271
+
272
+ def segment(self, frame, fb_global):
273
+
274
+ obj_n = fb_global.obj_n
275
+
276
+ if not self.training:
277
+ [frame], pad = myutils.pad_divide_by([frame], 16, (frame.size()[2], frame.size()[3]))
278
+
279
+ r4, r3, r2, r1 = self.encoder_q(frame)
280
+ bs, _, global_match_h, global_match_w = r4.shape
281
+ _, _, local_match_h, local_match_w = r1.shape
282
+
283
+ k4, v4 = self.keyval_r4(r4) # 1, dim, H/16, W/16
284
+ res_global = self.global_matcher(fb_global, k4, v4)
285
+ res_global = res_global.reshape(bs * obj_n, v4.shape[1] * 2, global_match_h, global_match_w)
286
+
287
+ r3_size = r3.shape
288
+ r2_size = r2.shape
289
+ r3 = r3.unsqueeze(1).expand(-1, obj_n, -1, -1, -1).reshape(bs * obj_n, *r3_size[1:])
290
+ r2 = r2.unsqueeze(1).expand(-1, obj_n, -1, -1, -1).reshape(bs * obj_n, *r2_size[1:])
291
+
292
+ r1_size = r1.shape
293
+ r1 = r1.unsqueeze(1).expand(-1, obj_n, -1, -1, -1).reshape(bs * obj_n, *r1_size[1:])
294
+ feature_size = (bs, obj_n, r1_size[2], r1_size[3])
295
+ score = self.decoder(res_global, r3, r2, r1, feature_size)
296
+
297
+ # score = score.view(obj_n, bs, *frame.shape[-2:]).permute(1, 0, 2, 3)
298
+ score = score.view(bs, obj_n, *frame.shape[-2:])
299
+
300
+ if self.training:
301
+ uncertainty = myutils.calc_uncertainty(F.softmax(score, dim=1))
302
+ uncertainty = uncertainty.view(bs, -1).norm(p=2, dim=1) / math.sqrt(frame.shape[-2] * frame.shape[-1]) # [B,1,H,W]
303
+ uncertainty = uncertainty.mean()
304
+ else:
305
+ uncertainty = None
306
+
307
+ score = torch.clamp(score, 1e-7, 1 - 1e-7)
308
+ score = torch.log((score / (1 - score)))
309
+
310
+ if not self.training:
311
+ if pad[2] + pad[3] > 0:
312
+ score = score[:, :, pad[2]:-pad[3], :]
313
+ if pad[0] + pad[1] > 0:
314
+ score = score[:, :, :, pad[0]:-pad[1]]
315
+
316
+ return score, uncertainty
317
+
318
+ def forward(self, x):
319
+ pass
video_module/model/FeatureBank.py ADDED
@@ -0,0 +1,149 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as NF
4
+
5
+ from torch_scatter import scatter_mean
6
+
7
+
8
+ class FeatureBank:
9
+
10
+ def __init__(self, obj_n, memory_budget, device, update_rate=0.1, thres_close=0.95):
11
+ self.obj_n = obj_n
12
+ self.update_rate = update_rate
13
+ self.thres_close = thres_close
14
+ self.device = device
15
+
16
+ self.info = [None for _ in range(obj_n)]
17
+ self.peak_n = np.zeros(obj_n)
18
+ self.replace_n = np.zeros(obj_n)
19
+
20
+ self.class_budget = memory_budget // obj_n
21
+ if obj_n == 2:
22
+ self.class_budget = 0.8 * self.class_budget
23
+
24
+ self.keys = None
25
+ self.values = None
26
+
27
+ def init_bank(self, keys, values, frame_idx=0):
28
+
29
+ self.keys = keys
30
+ self.values = values
31
+
32
+ for class_idx in range(self.obj_n):
33
+ _, bank_n = keys[class_idx].shape
34
+ self.info[class_idx] = torch.zeros((bank_n, 2), device=self.device)
35
+ self.info[class_idx][:, 0] = frame_idx
36
+ self.peak_n[class_idx] = max(self.peak_n[class_idx], self.info[class_idx].shape[0])
37
+
38
+ def append(self, keys, values, frame_idx=0):
39
+
40
+ if self.keys:
41
+ for class_idx in range(self.obj_n):
42
+ self.keys[class_idx] = torch.cat([self.keys[class_idx], keys[class_idx]], dim=1)
43
+ self.values[class_idx] = torch.cat([self.values[class_idx], values[class_idx]], dim=1)
44
+
45
+ _, bank_n = keys[class_idx].shape
46
+ new_info = torch.ones((bank_n, 2), device=self.device) * 20 # zeros
47
+ new_info[:, 0] = frame_idx
48
+ self.info[class_idx] = torch.cat([self.info[class_idx], new_info], dim=0)
49
+ self.peak_n[class_idx] = max(self.peak_n[class_idx], self.info[class_idx].shape[0])
50
+ else:
51
+ self.init_bank(keys, values, frame_idx)
52
+
53
+ def update(self, prev_key, prev_value, frame_idx, update_rate=-1):
54
+
55
+ if update_rate == -1:
56
+ update_rate = self.update_rate
57
+
58
+ for class_idx in range(self.obj_n):
59
+
60
+ d_key, bank_n = self.keys[class_idx].shape
61
+ d_val, _ = self.values[class_idx].shape
62
+
63
+ normed_keys = NF.normalize(self.keys[class_idx], dim=0)
64
+ normed_prev_key = NF.normalize(prev_key[class_idx], dim=0)
65
+ mag_keys = self.keys[class_idx].norm(p=2, dim=0)
66
+ corr = torch.mm(normed_keys.transpose(0, 1), normed_prev_key) # bank_n, prev_n
67
+ related_bank_idx = corr.argmax(dim=0, keepdim=True) # 1, HW
68
+ related_bank_corr = torch.gather(corr, 0, related_bank_idx) # 1, HW
69
+
70
+ # greater than threshold, merge them
71
+ selected_idx = (related_bank_corr[0] > self.thres_close).nonzero(as_tuple=False)
72
+ class_related_bank_idx = related_bank_idx[0, selected_idx[:, 0]] # selected_HW
73
+ unique_related_bank_idx, cnt = class_related_bank_idx.unique(dim=0, return_counts=True) # selected_HW
74
+
75
+ # Update key
76
+ key_bank_update = torch.zeros((d_key, bank_n), dtype=torch.float, device=self.device) # d_key, THW
77
+ key_bank_idx = class_related_bank_idx.unsqueeze(0).expand(d_key, -1) # d_key, HW
78
+ scatter_mean(normed_prev_key[:, selected_idx[:, 0]], key_bank_idx, dim=1, out=key_bank_update)
79
+ # d_key, selected_HW
80
+
81
+ self.keys[class_idx][:, unique_related_bank_idx] = \
82
+ mag_keys[unique_related_bank_idx] * \
83
+ ((1 - update_rate) * normed_keys[:, unique_related_bank_idx] + \
84
+ update_rate * key_bank_update[:, unique_related_bank_idx])
85
+
86
+ # Update value
87
+ normed_values = NF.normalize(self.values[class_idx], dim=0)
88
+ normed_prev_value = NF.normalize(prev_value[class_idx], dim=0)
89
+ mag_values = self.values[class_idx].norm(p=2, dim=0)
90
+ val_bank_update = torch.zeros((d_val, bank_n), dtype=torch.float, device=self.device)
91
+ val_bank_idx = class_related_bank_idx.unsqueeze(0).expand(d_val, -1)
92
+ scatter_mean(normed_prev_value[:, selected_idx[:, 0]], val_bank_idx, dim=1, out=val_bank_update)
93
+
94
+ self.values[class_idx][:, unique_related_bank_idx] = \
95
+ mag_values[unique_related_bank_idx] * \
96
+ ((1 - update_rate) * normed_values[:, unique_related_bank_idx] + \
97
+ update_rate * val_bank_update[:, unique_related_bank_idx])
98
+
99
+ # less than the threshold, concat them
100
+ selected_idx = (related_bank_corr[0] <= self.thres_close).nonzero(as_tuple=False)
101
+
102
+ if self.class_budget < bank_n + selected_idx.shape[0]:
103
+ self.remove(class_idx, selected_idx.shape[0], frame_idx)
104
+
105
+ self.keys[class_idx] = torch.cat([self.keys[class_idx], prev_key[class_idx][:, selected_idx[:, 0]]], dim=1)
106
+ self.values[class_idx] = \
107
+ torch.cat([self.values[class_idx], prev_value[class_idx][:, selected_idx[:, 0]]], dim=1)
108
+
109
+ new_info = torch.zeros((selected_idx.shape[0], 2), device=self.device)
110
+ new_info[:, 0] = frame_idx
111
+ self.info[class_idx] = torch.cat([self.info[class_idx], new_info], dim=0)
112
+
113
+ self.peak_n[class_idx] = max(self.peak_n[class_idx], self.info[class_idx].shape[0])
114
+
115
+ self.info[class_idx][:, 1] = torch.clamp(self.info[class_idx][:, 1], 0, 1e5) # Prevent inf
116
+
117
+ def remove(self, class_idx, request_n, frame_idx):
118
+
119
+ old_size = self.keys[class_idx].shape[1]
120
+
121
+ LFU = frame_idx - self.info[class_idx][:, 0] # time length
122
+ LFU = self.info[class_idx][:, 1] / LFU
123
+ thres_dynamic = int(LFU.min()) + 1
124
+ iter_cnt = 0
125
+
126
+ while True:
127
+ selected_idx = LFU > thres_dynamic
128
+ self.keys[class_idx] = self.keys[class_idx][:, selected_idx]
129
+ self.values[class_idx] = self.values[class_idx][:, selected_idx]
130
+ self.info[class_idx] = self.info[class_idx][selected_idx]
131
+ LFU = LFU[selected_idx]
132
+ iter_cnt += 1
133
+
134
+ balance = (self.class_budget - self.keys[class_idx].shape[1]) - request_n
135
+ if balance < 0:
136
+ thres_dynamic = int(LFU.min()) + 1
137
+ else:
138
+ break
139
+
140
+ new_size = self.keys[class_idx].shape[1]
141
+ self.replace_n[class_idx] += old_size - new_size
142
+
143
+ return balance
144
+
145
+ def print_peak_mem(self):
146
+
147
+ ur = self.peak_n / self.class_budget
148
+ rr = self.replace_n / self.class_budget
149
+ print(f'Obj num: {self.obj_n}.', f'Budget / obj: {self.class_budget}.', f'UR: {ur}.', f'Replace: {rr}.')
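To make the interplay between AFB_URR and FeatureBank concrete, here is a condensed, hypothetical inference sketch. The frame size, object count, and memory budget are placeholders, and it assumes torch_scatter and a recent torchvision (with ResNet50_Weights) are installed; the real evaluation script is not part of this excerpt:

import torch
from video_module.model import AFB_URR, FeatureBank

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AFB_URR(device, update_bank=True, load_imagenet_params=False).to(device).eval()

first_frame = torch.rand(1, 3, 480, 864, device=device)        # placeholder first frame
first_mask = torch.zeros(1, 2, 480, 864, device=device)        # one-hot: background + 1 object
first_mask[:, 1, 100:200, 200:300] = 1
first_mask[:, 0] = 1 - first_mask[:, 1]

with torch.no_grad():
    # Encode the annotated first frame and seed the feature bank with its keys/values.
    keys, values = model.memorize(first_frame, first_mask)
    fb = FeatureBank(obj_n=2, memory_budget=300000, device=device)
    fb.init_bank(keys, values)

    # Segment a later frame by matching its query features against the bank.
    next_frame = torch.rand(1, 3, 480, 864, device=device)
    score, _ = model.segment(next_frame, fb)                    # (1, 2, 480, 864) logits
    pred = score.argmax(dim=1)                                  # per-pixel object id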
video_module/model/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .AFB_URR import *
+ from .FeatureBank import FeatureBank