# test_smoker.py

import numpy
import json
import os
import cv2
import time
import glob

import argparse
import torch
import yaml
import pickle
import numpy as np

from backbone import EfficientDetBackbone
from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip
from utils.timer import Timer
from utils.visual_smoker import visual_smoker_demo

# Command-line interface, declared as a table of (flag names, add_argument options)
# and registered in order below.
_CLI_OPTIONS = (
    (('-p', '--project'),
     dict(type=str, default='smoker-det', help='project file that contains parameters')),
    (('-c', '--compound_coef'),
     dict(type=int, default=1, help='coefficients of efficientdet')),
    (('-w', '--weights'),
     dict(type=str, default=None, help='/path/to/weights')),
    (('--nms_threshold',),
     dict(type=float, default=0.5,
          help="nms threshold, don't change it if not for testing purposes")),
    (('--cuda',), dict(type=int, default=1)),
    (('--device',), dict(type=int, default=0)),
    (('--float16',), dict(type=int, default=0)),
    (('--override',),
     dict(type=int, default=1, help='override previous bbox results file if exists')),
    (('--data_dir',),
     dict(type=str, default='./datasets', help='the root folder of dataset')),
    (('--need_visual',),
     dict(type=int, default=1, help='whether need to visualize the results')),
    (('--flip_test',),
     dict(type=int, default=0, help='whether apply flip augmentation when testing')),
)

ap = argparse.ArgumentParser()
for _flags, _options in _CLI_OPTIONS:
    ap.add_argument(*_flags, **_options)

# Parsed once at import time; the rest of the module reads its settings from `args`.
args = ap.parse_args()

# Unpack the parsed command-line options into the module-level settings used below.
compound_coef = args.compound_coef
nms_threshold = args.nms_threshold
use_cuda = args.cuda
gpu = args.device
use_float16 = args.float16
override_prev_results = args.override
need_visual = args.need_visual
# Fall back to the default checkpoint path when --weights is not given.
weights_path = './logs/smoker-detection/HOLT_best.pth' if args.weights is None else args.weights
data_dir = args.data_dir
project = args.project

# Directory passed to visual_smoker_demo(); makedirs also creates missing parent
# directories and does not fail when the directory already exists.
save_path = "./logs/smoker-detection/test_vis"
os.makedirs(save_path, exist_ok=True)

# Load the project configuration; the context manager closes the file handle.
with open(f'projects/{project}.yml') as project_file:
    params = yaml.safe_load(project_file)
SET_NAME = params['val_set']
project_name = params["project_name"]


print(f'evaluation on project {project_name}, weights {weights_path}...')

# `params` already holds the parsed project YAML (it is loaded earlier in this
# file), so the same file is not opened and parsed a second time here.

# Head sizes handed to EfficientDetBackbone in test() and used to size the
# score arrays built in hoi_match().
num_objects = 2
num_union_actions = 3
num_union_hois = 3
num_inst_actions = 6

# Network input resolution, looked up by the compound coefficient.
input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
input_size = input_sizes[compound_coef]
output_dir = f"./logs/{project_name}/test_results"

# makedirs creates the intermediate ./logs/<project_name> directory when needed
# and is a no-op when the results directory already exists.
os.makedirs(output_dir, exist_ok=True)

# Flip-augmented runs are written to their own results file.
detection_path = os.path.join(
    output_dir,
    f'{SET_NAME}_bbox_results_flip_final.pkl' if args.flip_test else f'{SET_NAME}_bbox_results_final.pkl',
)

obj_list = ['person', 'cigarette']

# Assign consecutive 1-based ids to the non-empty class names, in list order.
obj_dict = {}
cid = 0
for cid, obj in enumerate((name for name in obj_list if name != ""), start=1):
    obj_dict[obj] = cid


# Map every verb name to its position in verb_list.json.
with open(args.data_dir + "/smoker_det/annotations/verb_list.json", "r") as verb_file:
    verbs_hico = json.load(verb_file)
# The index is named `verb_idx` (not `id`) so the builtin `id` is not rebound
# at module scope.
verbs_dict = {item["name"]: verb_idx for verb_idx, item in enumerate(verbs_hico)}


with open(args.data_dir + "/smoker_det/annotations/hoi_list.json", "r") as file:
    hois_hico = json.load(file)

# Group HOI indices (positions in hoi_list.json) under the id of their verb.
verb_to_hoi = {}
for hoi_id, item in enumerate(hois_hico):
    verb_to_hoi.setdefault(verbs_dict[item["verb"]], []).append(hoi_id)

# Every HOI entry must have been assigned to a verb exactly once.
n = sum(len(hoi_ids) for hoi_ids in verb_to_hoi.values())
for verb_id, hoi_ids in verb_to_hoi.items():
    # Stored as arrays so they can be used directly as fancy indices later.
    verb_to_hoi[verb_id] = np.array(hoi_ids)
assert n == num_union_hois


def calc_ioa(a, b):
    """Return the pairwise intersection of `a` and `b` divided by the area of `b`.

    Both inputs are 2-D arrays whose first four columns are (x1, y1, x2, y2).
    The result has shape (len(a), len(b)); entry [i, j] is the overlap area of
    a[i] and b[j] over the area of b[j], with the area floored at 1e-6.
    """
    # Column slices of `a` broadcast against the rows of `b`.
    left = np.maximum(a[:, 0:1], b[:, 0])
    top = np.maximum(a[:, 1:2], b[:, 1])
    right = np.minimum(a[:, 2:3], b[:, 2])
    bottom = np.minimum(a[:, 3:4], b[:, 3])

    # Boxes that do not overlap would give negative extents; clamp them to zero.
    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)

    # Floor the denominator so zero-area boxes in `b` do not divide by zero.
    b_area = np.maximum((b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]), 1e-6)
    return overlap_w * overlap_h / b_area


def calc_iou(a, b):
    """Return the pairwise intersection-over-union of the boxes in `a` and `b`.

    Both inputs are 2-D arrays whose first four columns are (x1, y1, x2, y2).
    The result has shape (len(a), len(b)).
    """
    # Column slices of `a` broadcast against the rows of `b`.
    left = np.maximum(a[:, 0:1], b[:, 0])
    top = np.maximum(a[:, 1:2], b[:, 1])
    right = np.minimum(a[:, 2:3], b[:, 2])
    bottom = np.minimum(a[:, 3:4], b[:, 3])

    # Boxes that do not overlap would give negative extents; clamp them to zero.
    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)
    overlap = overlap_w * overlap_h

    a_area = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    b_area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    union = a_area[:, np.newaxis] + b_area - overlap
    # Non-positive unions are replaced by a tiny constant to avoid dividing by zero.
    union = np.where(union > 0, union, 1e-8)

    return overlap / union


def transform_class_id(id):
    """Translate an index into `obj_list` to the id stored in `obj_dict` for that class name."""
    return obj_dict[obj_list[id]]


def transform_action_hico(act_scores, mode):
    """Scatter one half of the instance action scores onto the union-action slots.

    `act_scores` is read in two halves of `num_inst_actions // 2` entries: the
    first half when `mode == "subject"`, the second half for any other mode.
    The score of verb `i` is written to the positions listed in `verb_to_hoi[i]`.
    """
    half = num_inst_actions // 2
    # Offset into act_scores that selects the requested half.
    offset = 0 if mode == "subject" else half

    union_scores = np.zeros(num_union_actions)
    for verb in range(half):
        union_scores[verb_to_hoi[verb]] = act_scores[verb + offset]
    return union_scores


def xy_to_wh(bbox):
    """Convert a corner box (x1, y1, x2, y2) to (center_x, center_y, width, height)."""
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    return (left + right) / 2, (top + bottom) / 2, right - left, bottom - top


def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma):
    """Score how close an object box center is to a target box center.

    The center offset between `obj_bbox` and `target_bbox` is normalized by the
    width and height of `anchor_bbox` and passed through a Gaussian kernel.

    Args:
        anchor_bbox: (x1, y1, x2, y2) box whose width/height normalize the offset.
        obj_bbox: (x1, y1, x2, y2) candidate object box.
        target_bbox: (x1, y1, x2, y2) box the object is compared against.
        human_bbox: Unused; kept so existing callers keep working.
        sigma: Standard deviation of the Gaussian kernel.

    Returns:
        exp(-(dx**2 + dy**2) / (2 * sigma**2)), where dx and dy are the
        normalized center offsets.
    """
    xo, yo, _, _ = xy_to_wh(obj_bbox)
    xt, yt, _, _ = xy_to_wh(target_bbox)
    _, _, wa, ha = xy_to_wh(anchor_bbox)

    # np.float64 replaces the `np.float` alias, which no longer exists in
    # current NumPy releases and made this function raise AttributeError.
    dist = np.array([(xo - xt) / wa, (yo - yt) / ha], dtype=np.float64)

    return np.exp(-1 * np.sum(dist ** 2) / (2 * sigma ** 2))


def target_object_dist(target_objects_pos, objects_pos, anchors):
    """Return squared, anchor-normalized distances between target and object positions.

    `target_objects_pos` and `anchors` are aligned row by row; `anchors` holds
    (x1, y1, x2, y2) boxes. Entry [i, j] of the result is the squared distance
    between target position i and object position j after dividing the x/y
    offsets by the width/height of anchor i.
    """
    # (width, height) per anchor, with a middle axis to broadcast over objects.
    anchor_wh = (anchors[:, 2:4] - anchors[:, 0:2])[:, np.newaxis, :]
    offsets = (np.expand_dims(target_objects_pos, 1) - objects_pos) / anchor_wh
    return np.square(offsets).sum(axis=2)


def hoi_match(image_id, preds_inst, preds_union, human_thre=0.3, anchor_thre=0.1, loc_thre=0.05):
    """Pair detected humans with detected instances and score every pair per HOI class.

    Args:
        image_id: Accepted for the caller's convenience; not used in this function.
        preds_inst: Instance predictions, read per detection through the keys
            "rois", "obj_class_ids", "obj_scores" and "act_scores".
        preds_union: Union-anchor predictions, read through the keys "rois",
            "sp_vector" and "act_scores". NOTE: when at least one human is
            found, every entry of this dict is filtered in place, so the
            caller's dict is modified.
        human_thre: Minimum "obj_scores" value for a class-0 detection to be
            treated as a human.
        anchor_thre: Union action scores below this value are zeroed.
        loc_thre: Location scores below this value are zeroed.

    Returns:
        A list with one entry per (human, other instance) pair, each entry being
        [human bbox, object bbox, object class id (via transform_class_id),
        per-HOI score vector, human score, object score].
    """
    num_inst = len(preds_inst["rois"])

    humans = []
    human_bboxes = []
    human_inst_ids = []
    human_role_scores = []

    # Collect class-0 detections scoring at least `human_thre`, lowering the
    # threshold by 0.1 per pass while nothing is found and it stays >= 0.2.
    # NOTE(review): with the default 0.3, `0.3 - 0.1` evaluates to
    # 0.19999999999999998 in floating point, so the loop stops before a second
    # pass at 0.2. Behaviour is intentionally left unchanged here; confirm
    # whether the 0.2 fallback was meant to run.
    while len(humans) == 0:
        if human_thre < 0.2:
            break
        for inst_id in range(num_inst):
            if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre:
                continue
            item = {
                "bbox": preds_inst["rois"][inst_id],
                # The first len(verb_to_hoi) action scores are used as the subject role.
                "role_scores": preds_inst["act_scores"][inst_id][:len(verb_to_hoi)],
                "obj_scores": preds_inst["obj_scores"][inst_id],
                "inst_id": inst_id,
            }
            humans.append(item)
            human_bboxes.append(item["bbox"])
            human_inst_ids.append(item["inst_id"])
            human_role_scores.append(item["role_scores"])
        human_thre -= 0.1
    human_bboxes = np.array(human_bboxes)
    human_inst_ids = np.array(human_inst_ids)
    human_role_scores = np.array(human_role_scores)

    # Every instance (humans included) is a candidate object; the remaining
    # action scores are used as the object role.
    objects = []
    obj_role_scores = []
    for obj_id in range(num_inst):
        obj_role_score = preds_inst["act_scores"][obj_id][len(verb_to_hoi):]
        objects.append({
            "obj_role_scores": obj_role_score,
            "obj_scores": preds_inst["obj_scores"][obj_id],
            "obj_class_id": preds_inst["obj_class_ids"][obj_id],
            "bbox": preds_inst["rois"][obj_id],
        })
        obj_role_scores.append(obj_role_score)
    object_bboxes = np.array(preds_inst["rois"])
    obj_role_scores = np.array(obj_role_scores)

    # np.float64 replaces the `np.float` alias, which no longer exists in
    # current NumPy releases and made this function raise AttributeError.
    hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), num_union_actions), dtype=np.float64)

    if len(human_bboxes) > 0:
        # Keep only union anchors that overlap some human box.
        IoA = calc_ioa(preds_union["rois"], human_bboxes)

        IoA_max = np.max(IoA, axis=1)
        human_foreground = IoA_max > 0.1  # 0.25
        human_IoA = IoA[human_foreground]
        for key in preds_union:
            preds_union[key] = preds_union[key][human_foreground]

        # Keep only union anchors whose second-largest instance overlap is also
        # above the threshold, i.e. anchors overlapping at least two instances.
        new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"])
        new_IoA_argmax = np.argmax(new_IoA, axis=1)
        new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0
        new_IoA_sec_max = np.max(new_IoA, axis=1)
        obj_foreground = new_IoA_sec_max > 0.1  # 0.25
        for key in preds_union:
            preds_union[key] = preds_union[key][obj_foreground]

        # Assign each remaining anchor to the human with the highest IoU among
        # the humans it sufficiently overlaps.
        human_IoU = calc_iou(preds_union["rois"], human_bboxes)
        human_IoA = human_IoA[obj_foreground]
        human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.1), axis=1)  # 0.25
        obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"])

        num_union = len(preds_union["rois"])
        num_human = len(human_bboxes)

        # Predicted object position = center of the assigned human + "sp_vector".
        sp_vectors = preds_union["sp_vector"]
        inter_human_regions = human_bboxes[human_IoU_argmax]
        humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2
        humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 3]) / 2
        humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1)
        inter_objects_pos = humans_pos + sp_vectors

        objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2
        objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2
        objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1)

        # For each anchor pick the instance closest to the predicted position.
        # The assigned human itself and instances barely inside the anchor get
        # a large sentinel distance so they are not preferred.
        obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"])
        inter_human_instids = human_inst_ids[human_IoU_argmax]
        obj_dists[np.arange(num_union), inter_human_instids] = 100
        obj_dists[obj_IoA < 0.1] = 100  # 0.25
        inter_obj_ids = np.argmin(obj_dists, 1)
        inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids]

        # Gaussian location score; low location and anchor scores are zeroed.
        sigma = 0.6
        location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2))
        location_scores = np.where(location_scores<loc_thre, 0, location_scores)
        anchor_scores = preds_union["act_scores"]
        anchor_scores = np.where(anchor_scores<anchor_thre, 0, anchor_scores)

        inter_human_ids = human_IoU_argmax
        inter_human_role_score = human_role_scores[inter_human_ids]
        inst_object_role_score = obj_role_scores[inter_obj_ids]

        # Per-anchor score: geometric mean of the two role scores, times the
        # anchor action score, times the location score raised to `tau`.
        tau = 1.5
        inter_scores = 0.5 * ((inter_human_role_score * inst_object_role_score) ** 0.5 * anchor_scores).T * location_scores ** tau

        inter_scores = inter_scores.T
        inter_scores[inst_object_role_score == 0] = 0

        # Sum the anchor scores that voted for the same (human, object) pair.
        for human_id in range(num_human):
            human_inter = inter_human_ids == human_id
            human_inter_obj_id = inter_obj_ids[human_inter]
            human_inter_score = inter_scores[human_inter]

            for obj_id in range(num_inst):
                hoi_pair_score[human_id, obj_id] = np.sum(human_inter_score[human_inter_obj_id==obj_id], axis=0)

    if args.flip_test:
        hoi_pair_score /= 2

    # Copy every verb score to the HOI slots that belong to that verb.
    hoi_cat_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), num_union_hois), dtype=np.float64)
    for verb in verb_to_hoi:
        hoi_cat_pair_score[:, :, verb_to_hoi[verb]] = hoi_pair_score[:, :, [verb]]

    dets = []
    for human_id, human in enumerate(humans):
        for obj_id, obj in enumerate(objects):
            # A human is never paired with itself.
            if human["inst_id"] == obj_id:
                continue
            dets.append([
                human["bbox"],                               # human box
                obj["bbox"],                                 # object box
                transform_class_id(obj["obj_class_id"]),     # object class id
                hoi_cat_pair_score[human_id, obj_id, :],     # per-HOI scores
                human["obj_scores"],                         # human score
                obj["obj_scores"],                           # object score
            ])
    return dets


def img_detect(file, img_dir, model, input_size, regressBoxes, clipBoxes, threshold):
    """Run the model on one image and return its matched human-object detections.

    Args:
        file: Image file name inside `img_dir`; the part of its stem after the
            last underscore must be an integer (it is parsed as the image id).
        img_dir: Directory containing the image.
        model: Network called on the preprocessed image batch.
        input_size: `max_size` passed to `preprocess`.
        regressBoxes: Box regression helper forwarded to the postprocess functions.
        clipBoxes: Box clipping helper forwarded to the postprocess functions.
        threshold: Score threshold forwarded to the instance postprocess step.

    Returns:
        The detection list produced by `hoi_match` for this image. When
        `need_visual` is set, `visual_smoker_demo` is also called for the image.
    """
    fname, ext = os.path.splitext(file)
    image_id = int(fname.split("_")[-1])

    img_path = os.path.join(img_dir, file)

    ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size)

    if use_cuda:
        x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
    else:
        x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

    # Move the channel axis forward: (batch, H, W, C) -> (batch, C, H, W).
    x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2)

    if args.flip_test:
        # Reverse the last (width) axis. The index tensor is created on the same
        # device as `x`, so flip testing also works when CUDA is disabled.
        ids = torch.arange(x.shape[-1] - 1, -1, -1, device=x.device).long()
        x_flip = x[..., ids]
        x_cat = torch.cat([x, x_flip], 0)

    with torch.no_grad():
        if args.flip_test:
            features, union_act_cls, union_sub_reg, union_obj_reg, \
            inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x_cat)

            # One copy of the anchors for the original and one for the flipped image.
            anchors = torch.cat([anchors, anchors], 0)
            preds_union = postprocess_dense_union_flip(x_cat, anchors, union_act_cls, union_sub_reg, union_obj_reg,
                                                  regressBoxes, clipBoxes, 0.1, 1)
            preds_inst = postprocess_hoi_flip(x_cat, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls,
                                         regressBoxes, clipBoxes, threshold, nms_threshold,
                                         mode="object", classwise=True)
        else:
            features, union_act_cls, union_sub_reg, union_obj_reg, \
            inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x)

            preds_union = postprocess_dense_union(x, anchors, union_act_cls, union_sub_reg, union_obj_reg,
                                            regressBoxes, clipBoxes, 0.1, 1)

            preds_inst = postprocess_hoi(x, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls,
                                     regressBoxes, clipBoxes, threshold, nms_threshold,
                                     mode="object", classwise=True)

        # Only one image is processed per call, so take the first result.
        preds_inst = invert_affine(framed_metas, preds_inst)[0]
        preds_union = invert_affine(framed_metas, preds_union)[0]

        dets = hoi_match(image_id, preds_inst, preds_union)

    if need_visual:
        visual_smoker_demo(preds_inst, dets, img_path, save_path)
    return dets

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in Python types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of `obj`.

        NumPy integers become `int`, NumPy floats become `float` and arrays
        become nested lists. Anything else is delegated to the base class,
        which raises `TypeError`.
        """
        # The abstract scalar bases cover every concrete integer/float width and
        # avoid aliases such as `numpy.float_` that newer NumPy releases removed.
        if isinstance(obj, numpy.integer):
            return int(obj)
        if isinstance(obj, numpy.floating):
            return float(obj)
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return super().default(obj)


def test(threshold=0.2):
    """Run detection on every test image and save the results.

    Detections are collected in a dict keyed by image id, pickled to
    `detection_path` and also written as JSON to
    ./logs/smoker-detection/dets.json.

    Args:
        threshold: Score threshold forwarded to `img_detect`.
    """
    # NOTE: eval() executes the anchor settings from the project YAML as Python
    # code; only run this script with trusted project files.
    model = EfficientDetBackbone(num_classes=num_objects, num_union_classes=num_union_actions,
                                 num_inst_classes=num_inst_actions, compound_coef=args.compound_coef,
                                 ratios=eval(params["anchors_ratios"]), scales=eval(params["anchors_scales"]))
    # Map the checkpoint onto the device that is actually requested, so the
    # weights can also be loaded when CUDA is disabled.
    load_device = torch.device('cuda' if args.cuda else 'cpu')
    model.load_state_dict(torch.load(weights_path, map_location=load_device))
    model.requires_grad_(False)
    model.eval()

    if args.cuda:
        model = model.cuda()
    if args.float16:
        model = model.half()

    regressBoxes = BBoxTransform()
    clipBoxes = ClipBoxes()

    img_dir = os.path.join(data_dir, "smoker_det/images/%s" % "test")

    # Sorted so images are processed in a reproducible order, and materialized
    # so the progress message can report the real number of images.
    img_paths = sorted(glob.glob(img_dir + '/' + '*.jpg'))
    total = len(img_paths)

    detect_timer = Timer()
    detection = {}
    for count, img_path in enumerate(img_paths, start=1):
        detect_timer.tic()

        # Use the real file name and parse the id the same way img_detect does,
        # instead of slicing a fixed number of characters off the path.
        file_name = os.path.basename(img_path)
        image_id = int(os.path.splitext(file_name)[0].split("_")[-1])

        dets = img_detect(file_name, img_dir, model, input_size, regressBoxes, clipBoxes, threshold=threshold)

        detection[image_id] = dets  # one entry per image
        detect_timer.toc()

        print('im_detect: {:d}/{:d}, image: {}, average time: {:.3f}s'.format(
            count, total, file_name, detect_timer.average_time))

    with open(detection_path, "wb") as pkl_file:
        pickle.dump(detection, pkl_file)
    with open("./logs/smoker-detection/dets.json", "w") as json_file:
        json.dump(detection, json_file, cls=NumpyEncoder)


if __name__ == '__main__':
    # Skip inference only when overriding is disabled and a results file already exists.
    reuse_cached = not override_prev_results and os.path.exists(detection_path)
    if not reuse_cached:
        test()