diff --git a/deva/vps_metrics/stuff_merging.py b/deva/vps_metrics/stuff_merging.py
index 58153bb..cba4578 100644
--- a/deva/vps_metrics/stuff_merging.py
+++ b/deva/vps_metrics/stuff_merging.py
@@ -6,7 +6,7 @@
 import numpy as np
 from PIL import Image
 from functools import partial
-from progressbar import progressbar
+from tqdm import tqdm
 
 from deva.utils.vipseg_categories import VIPSEG_CATEGORIES
 from deva.utils.pano_utils import IDPostprocessor, id_to_rgb
@@ -94,7 +94,7 @@ def merge_stuff(input_path, output_path):
     output_annotations = []
 
     pool = Pool(16)
-    for out_vid_ann in progressbar(pool.imap(
+    for out_vid_ann in tqdm(pool.imap(
             partial(process_single_video, input_path=input_path, output_path=output_path),
             annotations),
-                                   max_value=len(annotations)):
+                            total=len(annotations)):
diff --git a/evaluation/eval_ref_davis.py b/evaluation/eval_ref_davis.py
index 0596e00..7a3ceb3 100644
--- a/evaluation/eval_ref_davis.py
+++ b/evaluation/eval_ref_davis.py
@@ -13,151 +13,160 @@
 from deva.utils.palette import davis_palette
 from deva.inference.result_utils import ResultSaver
 from deva.inference.eval_args import add_common_eval_args, get_model_and_config
-"""
-Arguments loading
-"""
-parser = ArgumentParser()
-parser.add_argument('--img_path', default='../DAVIS/2017/trainval/JPEGImages/480p')
-parser.add_argument('--mask_path')
-parser.add_argument('--num_voting_frames',
-                    default=5,
-                    type=int,
-                    help='Number of frames selected for the initial consensus voting')
-add_common_eval_args(parser)
-network, config, args = get_model_and_config(parser)
-"""
-Data preparation
-"""
-out_path = args.output
-meta_dataset = ReferringDAVISTestDataset(args.img_path, args.mask_path)
-torch.autograd.set_grad_enabled(False)
-
-videos = meta_dataset.get_videos()
-
-total_process_time = 0
-total_frames = 0
-
-# Start eval
-pbar = tqdm(videos, total=len(videos))
-for vid_name in pbar:
-    pbar.set_description(vid_name)
-    video_scores = meta_dataset.get_scores(vid_name)
-    try:
-        """
-        initial pass, perform consensus voting and get a keyframe
-        """
-        image_feature_store = ImageFeatureStore(network)
-        vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, config['num_voting_frames'])
-        loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
-
-        time_indices = []
-        images = []
-        masks = []
-        scores = []
-        for ti, data in enumerate(loader):
-            time_indices.append(data['info']['time_index'][0].item())
-            image = data['rgb'].cuda()[0]
-            mask = data['mask'].cuda()[0]
-            images.append(image)
-            masks.append(mask)
-
-            frame_name = data['info']['frame'][0][:-4]
-            scores.append(video_scores[frame_name])
-
-        torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        keyframe_ti, projected_mask = find_consensus_with_established_association(
-            time_indices,
-            images,
-            masks,
-            scores=scores,
-            network=network,
-            store=image_feature_store,
-            config=config)
-        end.record()
-        torch.cuda.synchronize()
-        total_process_time += (start.elapsed_time(end) / 1000)
-        """
-        Backward pass video reader
-        """
-        backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name,
-                                                                    start=-1,
-                                                                    end=keyframe_ti + 1,
-                                                                    reverse=True)
-        """
-        Forward pass video reader
-        """
-        forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name,
-                                                                   start=keyframe_ti,
-                                                                   end=-1,
-                                                                   reverse=False)
-        """
-        Running them in combination
-        """
-        vid_readers = [backward_vid_reader, forward_vid_reader]
-        for vid_reader in vid_readers:
-            loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
-            vid_length = len(loader)
-            # no need to count usage for LT if the video is not that long anyway
-            config['enable_long_term_count_usage'] = (
-                config['enable_long_term']
-                and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) *
                     config['num_prototypes']) >= config['max_long_term_elements'])
-            processor = DEVAInferenceCore(network,
-                                          config=config,
-                                          image_feature_store=image_feature_store)
-            result_saver = ResultSaver(out_path,
-                                       vid_name,
-                                       dataset='ref_davis',
-                                       palette=davis_palette,
-                                       object_manager=processor.object_manager)
+def main():
+    """
+    Arguments loading
+    """
+    parser = ArgumentParser()
+    parser.add_argument('--img_path', default='../DAVIS/2017/trainval/JPEGImages/480p')
+    parser.add_argument('--mask_path')
+    parser.add_argument('--num_voting_frames',
+                        default=5,
+                        type=int,
+                        help='Number of frames selected for the initial consensus voting')
+    add_common_eval_args(parser)
+    network, config, args = get_model_and_config(parser)
+    """
+    Data preparation
+    """
+    out_path = args.output
+    meta_dataset = ReferringDAVISTestDataset(args.img_path, args.mask_path)
+    torch.autograd.set_grad_enabled(False)
+
+    videos = meta_dataset.get_videos()
+
+    total_process_time = 0
+    total_frames = 0
+
+    # Start eval
+    pbar = tqdm(videos, total=len(videos))
+    for vid_name in pbar:
+        pbar.set_description(vid_name)
+        video_scores = meta_dataset.get_scores(vid_name)
+        try:
+            """
+            initial pass, perform consensus voting and get a keyframe
+            """
+            image_feature_store = ImageFeatureStore(network)
+            vid_reader = meta_dataset.get_offline_sampled_frames(vid_name,
+                                                                 config['num_voting_frames'])
+            loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
+            time_indices = []
+            images = []
+            masks = []
+            scores = []
             for ti, data in enumerate(loader):
-                with torch.cuda.amp.autocast(enabled=args.amp):
-                    image = data['rgb'].cuda()[0]
-                    info = data['info']
-                    frame = info['frame'][0]
-                    shape = info['shape']
-                    need_resize = info['need_resize'][0]
-                    image_ti = info['time_index'][0].item()
-
-                    if image_ti == keyframe_ti:
-                        mask = projected_mask
-                    else:
-                        mask = None
-
-                    start = torch.cuda.Event(enable_timing=True)
-                    end = torch.cuda.Event(enable_timing=True)
-                    start.record()
-
-                    # Run the model on this frame
-                    prob = processor.step(image,
-                                          mask,
-                                          end=(ti == vid_length - 1),
-                                          hard_mask=False,
-                                          image_ti_override=image_ti)
-
-                    end.record()
-                    torch.cuda.synchronize()
-                    total_process_time += (start.elapsed_time(end) / 1000)
-                    total_frames += 1
-
-                    result_saver.save_mask(prob, frame, need_resize=need_resize, shape=shape)
-
-        result_saver.end()
-        with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f:
-            f.write(f'options: {time_indices}; keyframe: {keyframe_ti}')
-
-    except Exception as e:
-        print(f'Runtime error at {vid_name}')
-        print(e)
-        raise e
-
-print(f'Total processing time: {total_process_time}')
-print(f'Total processed frames: {total_frames}')
-print(f'FPS: {total_frames / total_process_time}')
-print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
+                time_indices.append(data['info']['time_index'][0].item())
+                image = data['rgb'].cuda()[0]
+                mask = data['mask'].cuda()[0]
+                images.append(image)
+                masks.append(mask)
+
+                frame_name = data['info']['frame'][0][:-4]
+                scores.append(video_scores[frame_name])
+
+            torch.cuda.synchronize()
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            keyframe_ti, projected_mask = find_consensus_with_established_association(
+ time_indices, + images, + masks, + scores=scores, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] and + (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + result_saver = ResultSaver(out_path, + vid_name, + dataset='ref_davis', + palette=davis_palette, + object_manager=processor.object_manager) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti) + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + result_saver.save_mask(prob, frame, need_resize=need_resize, shape=shape) + + result_saver.end() + with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_ref_youtubevos.py b/evaluation/eval_ref_youtubevos.py index 2a0661c..491fa66 100644 --- a/evaluation/eval_ref_youtubevos.py +++ b/evaluation/eval_ref_youtubevos.py @@ -17,177 +17,185 @@ from deva.inference.consensus_associated import find_consensus_with_established_association from deva.utils.load_subset import load_referring_yv_val from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--img_path', default='../YouTube/all_frames/valid_all_frames/JPEGImages') -parser.add_argument('--mask_path') -parser.add_argument('--json_path', - default='../YouTube/meta_expressions/valid/meta_expressions.json') -parser.add_argument('--num_voting_frames', - default=10, - type=int, - help='Number of frames selected for the initial consensus voting') 
-add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Data preparation -""" -out_path = args.output -meta_dataset = ReferringYouTubeVOSTestDataset(args.img_path, args.mask_path, args.json_path) -torch.autograd.set_grad_enabled(False) - -videos = meta_dataset.get_videos() -video_subset = load_referring_yv_val() -print(f'Subset size: {len(video_subset)}') - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(video_subset) -for vid_name in pbar: - pbar.set_description(vid_name) - objects = meta_dataset.get_objects(vid_name) - video_scores = meta_dataset.get_scores(vid_name) - image_feature_store = ImageFeatureStore(network, no_warning=True) - for object_name in objects: - try: - """ - initial pass, perform consensus voting and get a keyframe - """ - object_scores = video_scores[object_name] - vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, object_name, - config['num_voting_frames']) - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - - time_indices = [] - images = [] - masks = [] - scores = [] - for ti, data in enumerate(loader): - image_ti = data['info']['time_index'][0].item() - time_indices.append(image_ti) - image = data['rgb'].cuda()[0] - mask = data['mask'].cuda()[0] - images.append(image) - masks.append(mask) - - frame_name = data['info']['frame'][0][:-4] - scores.append(object_scores[frame_name]) - - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - keyframe_ti, projected_mask = find_consensus_with_established_association( - time_indices, - images, - masks, - scores=scores, - network=network, - store=image_feature_store, - config=config) - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - """ - Backward pass video reader - """ - backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - object_name, - start=-1, - end=keyframe_ti + 1, - reverse=True) - """ - Forward pass video reader - """ - forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - object_name, - start=keyframe_ti, - end=-1, - reverse=False) - """ - Running them in combination - """ - vid_readers = [backward_vid_reader, forward_vid_reader] - for vid_reader in vid_readers: - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] and - (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - processor = DEVAInferenceCore(network, - config=config, - image_feature_store=image_feature_store) +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='../YouTube/all_frames/valid_all_frames/JPEGImages') + parser.add_argument('--mask_path') + parser.add_argument('--json_path', + default='../YouTube/meta_expressions/valid/meta_expressions.json') + parser.add_argument('--num_voting_frames', + default=10, + type=int, + help='Number of frames selected for the initial consensus voting') + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Data preparation + """ + out_path = args.output + meta_dataset = ReferringYouTubeVOSTestDataset(args.img_path, args.mask_path, args.json_path) + 
torch.autograd.set_grad_enabled(False) + + videos = meta_dataset.get_videos() + video_subset = load_referring_yv_val() + print(f'Subset size: {len(video_subset)}') + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(video_subset) + for vid_name in pbar: + pbar.set_description(vid_name) + objects = meta_dataset.get_objects(vid_name) + video_scores = meta_dataset.get_scores(vid_name) + image_feature_store = ImageFeatureStore(network, no_warning=True) + for object_name in objects: + try: + """ + initial pass, perform consensus voting and get a keyframe + """ + object_scores = video_scores[object_name] + vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, object_name, + config['num_voting_frames']) + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + time_indices = [] + images = [] + masks = [] + scores = [] for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - image_ti = info['time_index'][0].item() - - if image_ti == keyframe_ti: - mask = projected_mask - else: - mask = None - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # Run the model on this frame - prob = processor.step(image, - mask, - end=(ti == vid_length - 1), - hard_mask=False, - image_ti_override=image_ti, - delete_buffer=False) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - out_mask = (prob[1] > prob[0]).float() * 255 - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - # Save the mask - if args.save_all or info['save'][0]: - this_out_path = path.join(out_path, 'Annotations', vid_name, - object_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - with open(path.join(out_path, 'Annotations', vid_name, object_name, 'key.txt'), - 'w') as f: - f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') - -print('Making zip for YouTubeVOS...') -shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, - 'Annotations') + image_ti = data['info']['time_index'][0].item() + time_indices.append(image_ti) + image = data['rgb'].cuda()[0] + mask = data['mask'].cuda()[0] + images.append(image) + masks.append(mask) + + frame_name = data['info']['frame'][0][:-4] + scores.append(object_scores[frame_name]) + + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + keyframe_ti, projected_mask = find_consensus_with_established_association( + time_indices, + images, + masks, + scores=scores, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + 
backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + object_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + object_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / + (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti, + delete_buffer=False) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + out_mask = (prob[1] > prob[0]).float() * 255 + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + # Save the mask + if args.save_all or info['save'][0]: + this_out_path = path.join(out_path, 'Annotations', vid_name, + object_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + with open(path.join(out_path, 'Annotations', vid_name, object_name, 'key.txt'), + 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + print('Making zip for YouTubeVOS...') + shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, + 'Annotations') + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_saliency.py b/evaluation/eval_saliency.py index 9368506..47da6de 100644 --- a/evaluation/eval_saliency.py +++ b/evaluation/eval_saliency.py @@ -16,151 +16,165 @@ from deva.inference.consensus_associated import find_consensus_with_established_association from deva.utils.tensor_utils import pad_divide_by from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--img_path', default='../DAVIS/2016/JPEGImages/480p') -parser.add_argument('--mask_path') -parser.add_argument('--imset_path') 
-parser.add_argument('--num_voting_frames', - default=10, - type=int, - help='Number of frames selected for the initial consensus voting') -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Data preparation -""" -out_path = args.output -meta_dataset = DAVISSaliencyTestDataset(args.img_path, args.mask_path, imset=args.imset_path) -torch.autograd.set_grad_enabled(False) - -videos = meta_dataset.get_videos() - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(videos, total=len(meta_dataset)) -for vid_name in pbar: - pbar.set_description(vid_name) - - try: - """ - initial pass, perform consensus voting and get a keyframe - """ - image_feature_store = ImageFeatureStore(network) - vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, config['num_voting_frames']) - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - - time_indices = [] - images = [] - masks = [] - for ti, data in enumerate(loader): - time_indices.append(data['info']['time_index'][0].item()) - image = data['rgb'].cuda()[0] - mask = data['mask'].cuda()[0] - image, _ = pad_divide_by(image, 16) - mask, _ = pad_divide_by(mask, 16) - images.append(image) - masks.append(mask) - - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - keyframe_ti, projected_mask = find_consensus_with_established_association( - time_indices, images, masks, network=network, store=image_feature_store, config=config) - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - """ - Backward pass video reader - """ - backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - start=-1, - end=keyframe_ti + 1, - reverse=True) - """ - Forward pass video reader - """ - forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - start=keyframe_ti, - end=-1, - reverse=False) - """ - Running them in combination - """ - vid_readers = [backward_vid_reader, forward_vid_reader] - for vid_reader in vid_readers: - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - processor = DEVAInferenceCore(network, - config=config, - image_feature_store=image_feature_store) +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='../DAVIS/2016/JPEGImages/480p') + parser.add_argument('--mask_path') + parser.add_argument('--imset_path') + parser.add_argument('--num_voting_frames', + default=10, + type=int, + help='Number of frames selected for the initial consensus voting') + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Data preparation + """ + out_path = args.output + meta_dataset = DAVISSaliencyTestDataset(args.img_path, args.mask_path, imset=args.imset_path) + torch.autograd.set_grad_enabled(False) + + videos = meta_dataset.get_videos() + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(videos, total=len(meta_dataset)) + for vid_name in pbar: + pbar.set_description(vid_name) + + try: + """ + initial pass, perform consensus voting and get a keyframe + """ + 
image_feature_store = ImageFeatureStore(network) + vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, + config['num_voting_frames']) + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + time_indices = [] + images = [] + masks = [] for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - image_ti = info['time_index'][0].item() - - if image_ti == keyframe_ti: - mask = projected_mask - else: - mask = None - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # Run the model on this frame - prob = processor.step(image, - mask, - end=(ti == vid_length - 1), - hard_mask=False, - image_ti_override=image_ti) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - out_mask = (prob[1] > prob[0]).float() * 255 - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - # Save the mask - this_out_path = path.join(out_path, vid_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: - f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + time_indices.append(data['info']['time_index'][0].item()) + image = data['rgb'].cuda()[0] + mask = data['mask'].cuda()[0] + image, _ = pad_divide_by(image, 16) + mask, _ = pad_divide_by(mask, 16) + images.append(image) + masks.append(mask) + + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + keyframe_ti, projected_mask = find_consensus_with_established_association( + time_indices, + images, + masks, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] and + (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + + for ti, data 
in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + out_mask = (prob[1] > prob[0]).float() * 255 + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + # Save the mask + this_out_path = path.join(out_path, vid_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_vos.py b/evaluation/eval_vos.py index 3db6714..f6f03d8 100644 --- a/evaluation/eval_vos.py +++ b/evaluation/eval_vos.py @@ -18,210 +18,217 @@ from deva.inference.data.vos_test_datasets import GeneralVOSTestDataset, DAVISTestDataset, YouTubeVOSTestDataset from deva.inference.inference_core import DEVAInferenceCore from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--d16_path', default='../DAVIS/2016') -parser.add_argument('--d17_path', default='../DAVIS/2017') -parser.add_argument('--y18_path', default='../YouTube2018') -parser.add_argument('--y19_path', default='../YouTube') -# For generic (G) evaluation, point to a folder that contains "JPEGImages" and "Annotations" -parser.add_argument('--generic_path', default='./example/vos') - -parser.add_argument('--dataset', help='D16/D17/Y18/Y19/G', default='D17') -parser.add_argument('--split', help='val/test', default='val') -parser.add_argument('--use_all_masks', - help='Use all masks in the mask folder for generic evaluation. ' - 'Forced to be True for YouTubeVOS; forced to be False for DAVIS/MOSE.', - action='store_true') - -# Multi-scale options -parser.add_argument('--save_scores', action='store_true') -parser.add_argument('--flip', action='store_true') - -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -args.dataset = args.dataset.upper() - -if args.output is None: - args.output = f'../output/{args.dataset}_{args.split}' - print(f'Output path not provided. 
Defaulting to {args.output}') -""" -Data preparation -""" -is_youtube = args.dataset.startswith('Y') -is_davis = args.dataset.startswith('D') - -if is_youtube or args.save_scores: - out_path = path.join(args.output, 'Annotations') -else: - out_path = args.output - -if is_youtube: - if args.dataset == 'Y18': - yv_path = args.y18_path - elif args.dataset == 'Y19': - yv_path = args.y19_path - - if args.split == 'val': - args.split = 'valid' - meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='valid', size=args.size) - elif args.split == 'test': - meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='test', size=args.size) + + +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--d16_path', default='../DAVIS/2016') + parser.add_argument('--d17_path', default='../DAVIS/2017') + parser.add_argument('--y18_path', default='../YouTube2018') + parser.add_argument('--y19_path', default='../YouTube') + # For generic (G) evaluation, point to a folder that contains "JPEGImages" and "Annotations" + parser.add_argument('--generic_path', default='./example/vos') + + parser.add_argument('--dataset', help='D16/D17/Y18/Y19/G', default='D17') + parser.add_argument('--split', help='val/test', default='val') + parser.add_argument('--use_all_masks', + help='Use all masks in the mask folder for generic evaluation. ' + 'Forced to be True for YouTubeVOS; forced to be False for DAVIS/MOSE.', + action='store_true') + + # Multi-scale options + parser.add_argument('--save_scores', action='store_true') + parser.add_argument('--flip', action='store_true') + + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + args.dataset = args.dataset.upper() + + if args.output is None: + args.output = f'../output/{args.dataset}_{args.split}' + print(f'Output path not provided. 
Defaulting to {args.output}') + """ + Data preparation + """ + is_youtube = args.dataset.startswith('Y') + is_davis = args.dataset.startswith('D') + + if is_youtube or args.save_scores: + out_path = path.join(args.output, 'Annotations') else: - raise NotImplementedError + out_path = args.output + + if is_youtube: + if args.dataset == 'Y18': + yv_path = args.y18_path + elif args.dataset == 'Y19': + yv_path = args.y19_path -elif is_davis: - if args.dataset == 'D16': - if args.split == 'val': - # Set up Dataset, a small hack to use the image set in the 2017 folder because the 2016 one is of a different format - meta_dataset = DAVISTestDataset(args.d16_path, - imset='../../2017/trainval/ImageSets/2016/val.txt', - size=args.size) - else: - raise NotImplementedError - palette = None - elif args.dataset == 'D17': if args.split == 'val': - meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'trainval'), - imset='2017/val.txt', - size=args.size) + args.split = 'valid' + meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='valid', size=args.size) elif args.split == 'test': - meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'test-dev'), - imset='2017/test-dev.txt', - size=args.size) + meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='test', size=args.size) else: raise NotImplementedError -elif args.dataset == 'G': - meta_dataset = GeneralVOSTestDataset(path.join(args.generic_path), - size=args.size, - use_all_masks=args.use_all_masks) - - if not args.save_all: - args.save_all = True - print('save_all is forced to be true in generic evaluation mode.') -else: - raise NotImplementedError - -torch.autograd.set_grad_enabled(False) - -# Set up loader -meta_loader = meta_dataset.get_datasets() - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(meta_loader, total=len(meta_dataset)) -for vid_reader in pbar: - - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_name = vid_reader.vid_name - pbar.set_description(vid_name) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - - try: - processor = DEVAInferenceCore(network, config=config) - first_mask_loaded = False - - for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - mask = data.get('mask') - if mask is not None: - mask = mask.cuda()[0] - valid_labels = data.get('valid_labels') - if valid_labels is not None: - valid_labels = valid_labels.tolist()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # if, for some reason, the first frame is not aligned with the first mask - if not first_mask_loaded: + + elif is_davis: + if args.dataset == 'D16': + if args.split == 'val': + # Set up Dataset, a small hack to use the image set in the 2017 folder because the 2016 one is of a different format + meta_dataset = DAVISTestDataset(args.d16_path, + imset='../../2017/trainval/ImageSets/2016/val.txt', + size=args.size) + else: + raise NotImplementedError + palette = None + elif args.dataset == 'D17': + if args.split == 'val': + meta_dataset = 
DAVISTestDataset(path.join(args.d17_path, 'trainval'), + imset='2017/val.txt', + size=args.size) + elif args.split == 'test': + meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'test-dev'), + imset='2017/test-dev.txt', + size=args.size) + else: + raise NotImplementedError + elif args.dataset == 'G': + meta_dataset = GeneralVOSTestDataset(path.join(args.generic_path), + size=args.size, + use_all_masks=args.use_all_masks) + + if not args.save_all: + args.save_all = True + print('save_all is forced to be true in generic evaluation mode.') + else: + raise NotImplementedError + + torch.autograd.set_grad_enabled(False) + + # Set up loader + meta_loader = meta_dataset.get_datasets() + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(meta_loader, total=len(meta_dataset)) + for vid_reader in pbar: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_name = vid_reader.vid_name + pbar.set_description(vid_name) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + try: + processor = DEVAInferenceCore(network, config=config) + first_mask_loaded = False + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + mask = data.get('mask') if mask is not None: - first_mask_loaded = True - else: - # no point to do anything without a mask - continue - - if args.flip: - image = torch.flip(image, dims=[-1]) - mask = torch.flip(mask, dims=[-1]) if mask is not None else None - - # Run the model on this frame - prob = processor.step(image, mask, valid_labels, end=(ti == vid_length - 1)) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - if args.flip: - prob = torch.flip(prob, dims=[-1]) - - # Probability mask -> index mask - out_mask = torch.argmax(prob, dim=0) - out_mask = processor.object_manager.tmp_to_obj_cls(out_mask) - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - if args.save_scores: - prob = (prob.detach().cpu().numpy() * 255).astype(np.uint8) - - # Save the mask - if args.save_all or info['save'][0]: - this_out_path = path.join(out_path, vid_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - if vid_reader.get_palette() is not None: - out_img.putpalette(vid_reader.get_palette()) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - if args.save_scores: - np_path = path.join(args.output, 'Scores', vid_name) - os.makedirs(np_path, exist_ok=True) - if ti == len(loader) - 1: - hkl.dump(processor.object_manager.get_tmp_to_obj_mapping(), - path.join(np_path, f'backward.hkl'), - mode='w') + mask = mask.cuda()[0] + valid_labels = data.get('valid_labels') + if valid_labels is not None: + valid_labels = valid_labels.tolist()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # if, for some reason, the first frame is not aligned with the first mask + if not first_mask_loaded: + 
if mask is not None: + first_mask_loaded = True + else: + # no point to do anything without a mask + continue + + if args.flip: + image = torch.flip(image, dims=[-1]) + mask = torch.flip(mask, dims=[-1]) if mask is not None else None + + # Run the model on this frame + prob = processor.step(image, mask, valid_labels, end=(ti == vid_length - 1)) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + if args.flip: + prob = torch.flip(prob, dims=[-1]) + + # Probability mask -> index mask + out_mask = torch.argmax(prob, dim=0) + out_mask = processor.object_manager.tmp_to_obj_cls(out_mask) + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + if args.save_scores: + prob = (prob.detach().cpu().numpy() * 255).astype(np.uint8) + + # Save the mask if args.save_all or info['save'][0]: - hkl.dump(prob, - path.join(np_path, f'{frame[:-4]}.hkl'), - mode='w', - compression='lzf') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') - -if not args.save_scores: - if is_youtube: - print('Making zip for YouTubeVOS...') - shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, - 'Annotations') - elif is_davis and args.split == 'test': - print('Making zip for DAVIS test-dev...') - shutil.make_archive(args.output, 'zip', args.output) + this_out_path = path.join(out_path, vid_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + if vid_reader.get_palette() is not None: + out_img.putpalette(vid_reader.get_palette()) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + if args.save_scores: + np_path = path.join(args.output, 'Scores', vid_name) + os.makedirs(np_path, exist_ok=True) + if ti == len(loader) - 1: + hkl.dump(processor.object_manager.get_tmp_to_obj_mapping(), + path.join(np_path, f'backward.hkl'), + mode='w') + if args.save_all or info['save'][0]: + hkl.dump(prob, + path.join(np_path, f'{frame[:-4]}.hkl'), + mode='w', + compression='lzf') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + if not args.save_scores: + if is_youtube: + print('Making zip for YouTubeVOS...') + shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', + args.output, 'Annotations') + elif is_davis and args.split == 'test': + print('Making zip for DAVIS test-dev...') + shutil.make_archive(args.output, 'zip', args.output) + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_with_detections.py b/evaluation/eval_with_detections.py index e68e7d4..3c8f3e4 100644 --- a/evaluation/eval_with_detections.py +++ b/evaluation/eval_with_detections.py @@ -23,247 +23,268 @@ from deva.vps_metrics.eval_vpq_vipseg import eval_vpq from deva.inference.postprocess_unsup_davis17 import limit_max_id -# for id2rgb -np.random.seed(42) -""" -Arguments loading -""" 
-parser = ArgumentParser() -parser.add_argument('--img_path', default='./example/vipseg') -parser.add_argument('--mask_path') -parser.add_argument('--json_path', default=None) -parser.add_argument('--detection_every', type=int, default=5) -parser.add_argument('--num_voting_frames', - default=3, - type=int, - help='Number of frames selected for voting. only valid in semionline') -parser.add_argument('--dataset', default='vipseg', help='vipseg/burst/unsup_davis17/demo') -parser.add_argument('--max_missed_detection_count', type=int, default=5) -# skip VPQ/STQ computation -parser.add_argument('--no_metrics', action='store_true') - -parser.add_argument('--temporal_setting', default='semionline', help='semionline/online') -parser.add_argument('--max_num_objects', - default=-1, - type=int, - help='Max. num of objects to keep in memory. -1 for no limit') - -# the options below are only valid for burst -parser.add_argument('--start', type=int, default=None, help='for distributed testing') -parser.add_argument('--count', type=int, default=None, help='for distributed testing') -parser.add_argument('--burst_gt_json', default='../BURST/val/all_classes.json') - -# only valid for VIPSeg -parser.add_argument('--vipseg_root', default='../VIPSeg/VIPSeg_720P') - -# this option is only valid for unsup_davis17; limit the maximum number of predicted objects -parser.add_argument('--postprocess_limit_max_id', type=int, default=20) - -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Temporal setting -""" -temporal_setting = args.temporal_setting.lower() -assert temporal_setting in ['semionline', 'online'] -""" -Data preparation -""" -dataset_name = args.dataset.lower() -assert dataset_name in ['vipseg', 'burst', 'unsup_davis17', - 'demo'], f'Unknown dataset {dataset_name}' -print(f'Dataset: {dataset_name}') -is_vipseg = (dataset_name == 'vipseg') -is_burst = (dataset_name == 'burst') -is_davis = (dataset_name == 'unsup_davis17') -is_demo = (dataset_name == 'demo') - -# try to find json path is not given -if args.json_path is None: - if path.exists(path.join(args.mask_path, 'pred.json')): - args.json_path = path.join(args.mask_path, 'pred.json') -out_path = args.output - -# try to find the real mask path if it is hidden behind pan_pred -if path.exists(path.join(args.mask_path, 'pan_pred')): - args.mask_path = path.join(args.mask_path, 'pan_pred') -if is_vipseg or is_davis or is_demo: - meta_dataset = VIPSegDetectionTestDataset(args.img_path, args.mask_path, args.size) -elif is_burst: - meta_dataset = BURSTDetectionTestDataset(args.img_path, - args.mask_path, - args.burst_gt_json, - args.size, - start=args.start, - count=args.count) -else: - raise NotImplementedError - -torch.autograd.set_grad_enabled(False) - -# Set up loader -meta_loader = meta_dataset.get_datasets() -""" -Read the global pred.json if any -""" -global_json_enabled = args.json_path is not None -per_vid_json_enabled = None -if global_json_enabled: - print(f'Using a global json file {args.json_path}') - with open(args.json_path, 'r') as f: - all_json_info = json.load(f) - all_json_info = all_json_info['annotations'] - - video_id_to_annotation = {} - for ann in all_json_info: - video_id_to_annotation[ann['video_id']] = ann['annotations'] - -if is_vipseg: - # we will export this as a single json for VPQ/STQ evaluation - output_json_annotations = [] - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(meta_loader, total=len(meta_dataset)) -for vid_reader in pbar: - loader = 
DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_name = vid_reader.vid_name - pbar.set_description(vid_name) - vid_length = len(loader) - next_voting_frame = args.num_voting_frames - 1 - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - - try: - processor = DEVAInferenceCore(network, config=config) - result_saver = ResultSaver(out_path, - vid_name, - dataset=dataset_name, - palette=vid_reader.palette, - object_manager=processor.object_manager) - - for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - mask = data.get('mask') - if mask is not None: - mask = mask.cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - is_rgb = info['is_rgb'][0] - path_to_image = info['path_to_image'][0] - if args.save_all: - info['save'][0] = True - if is_rgb: - # if the mask format is RGB (instead of grayscale/palette), we need - # more usable IDs (>255) - processor.enabled_long_id() - - segments_info = None - if not global_json_enabled: - # safety check - json_path = info.get('json') - if per_vid_json_enabled is None: - if json_path is None: - print('Neither global nor per-video json exist.') - per_vid_json_enabled = False - else: - print('Using per-video json.') - per_vid_json_enabled = True - elif json_path is None and per_vid_json_enabled: - raise RuntimeError( - f'Per-video json is enabled but not found for {vid_name}.') - - # read the per-video pred.json - if per_vid_json_enabled: - with open(json_path[0], 'r') as f: - segments_info = json.load(f) + +def main(): + # for id2rgb + np.random.seed(42) + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='./example/vipseg') + parser.add_argument('--mask_path') + parser.add_argument('--json_path', default=None) + parser.add_argument('--detection_every', type=int, default=5) + parser.add_argument('--num_voting_frames', + default=3, + type=int, + help='Number of frames selected for voting. only valid in semionline') + parser.add_argument('--dataset', default='vipseg', help='vipseg/burst/unsup_davis17/demo') + parser.add_argument('--max_missed_detection_count', type=int, default=5) + # skip VPQ/STQ computation + parser.add_argument('--no_metrics', action='store_true') + + parser.add_argument('--temporal_setting', default='semionline', help='semionline/online') + parser.add_argument('--max_num_objects', + default=-1, + type=int, + help='Max. num of objects to keep in memory. 
-1 for no limit') + + # the options below are only valid for burst + parser.add_argument('--start', type=int, default=None, help='for distributed testing') + parser.add_argument('--count', type=int, default=None, help='for distributed testing') + parser.add_argument('--burst_gt_json', default='../BURST/val/all_classes.json') + + # only valid for VIPSeg + parser.add_argument('--vipseg_root', default='../VIPSeg/VIPSeg_720P') + + # this option is only valid for unsup_davis17; limit the maximum number of predicted objects + parser.add_argument('--postprocess_limit_max_id', type=int, default=20) + + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Temporal setting + """ + temporal_setting = args.temporal_setting.lower() + assert temporal_setting in ['semionline', 'online'] + """ + Data preparation + """ + dataset_name = args.dataset.lower() + assert dataset_name in ['vipseg', 'burst', 'unsup_davis17', + 'demo'], f'Unknown dataset {dataset_name}' + print(f'Dataset: {dataset_name}') + is_vipseg = (dataset_name == 'vipseg') + is_burst = (dataset_name == 'burst') + is_davis = (dataset_name == 'unsup_davis17') + is_demo = (dataset_name == 'demo') + + # try to find json path is not given + if args.json_path is None: + if path.exists(path.join(args.mask_path, 'pred.json')): + args.json_path = path.join(args.mask_path, 'pred.json') + out_path = args.output + + # try to find the real mask path if it is hidden behind pan_pred + if path.exists(path.join(args.mask_path, 'pan_pred')): + args.mask_path = path.join(args.mask_path, 'pan_pred') + if is_vipseg or is_davis or is_demo: + meta_dataset = VIPSegDetectionTestDataset(args.img_path, args.mask_path, args.size) + elif is_burst: + meta_dataset = BURSTDetectionTestDataset(args.img_path, + args.mask_path, + args.burst_gt_json, + args.size, + start=args.start, + count=args.count) + else: + raise NotImplementedError + + torch.autograd.set_grad_enabled(False) + + # Set up loader + meta_loader = meta_dataset.get_datasets() + """ + Read the global pred.json if any + """ + global_json_enabled = args.json_path is not None + per_vid_json_enabled = None + if global_json_enabled: + print(f'Using a global json file {args.json_path}') + with open(args.json_path, 'r') as f: + all_json_info = json.load(f) + all_json_info = all_json_info['annotations'] + + video_id_to_annotation = {} + for ann in all_json_info: + video_id_to_annotation[ann['video_id']] = ann['annotations'] + + if is_vipseg: + # we will export this as a single json for VPQ/STQ evaluation + output_json_annotations = [] + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(meta_loader, total=len(meta_dataset)) + for vid_reader in pbar: + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_name = vid_reader.vid_name + pbar.set_description(vid_name) + vid_length = len(loader) + next_voting_frame = args.num_voting_frames - 1 + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + try: + processor = DEVAInferenceCore(network, config=config) + result_saver = ResultSaver(out_path, + vid_name, + dataset=dataset_name, + palette=vid_reader.palette, + object_manager=processor.object_manager) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + 
image = data['rgb'].cuda()[0] + mask = data.get('mask') + if mask is not None: + mask = mask.cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + is_rgb = info['is_rgb'][0] + path_to_image = info['path_to_image'][0] + if args.save_all: + info['save'][0] = True + if is_rgb: + # if the mask format is RGB (instead of grayscale/palette), we need + # more usable IDs (>255) processor.enabled_long_id() - else: - # read from the global json - segments_info = video_id_to_annotation[vid_name][ti]['segments_info'] - processor.enabled_long_id() - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - segments_info = convert_json_dict_to_objects_info(mask, - segments_info, - dataset=dataset_name) - frame_info = FrameInfo(image, mask, segments_info, ti, info) - - if temporal_setting == 'semionline': - if ti + args.num_voting_frames > next_voting_frame: - # wait for more frames before proceeding - processor.add_to_temporary_buffer(frame_info) - - if ti == next_voting_frame: - # process this clip - this_image = processor.frame_buffer[0].image - this_ti = processor.frame_buffer[0].ti - this_frame_name = processor.frame_buffer[0].name - save_this_frame = processor.frame_buffer[0].save_needed - path_to_image = processor.frame_buffer[0].path_to_image - - _, mask, new_segments_info = processor.vote_in_temporary_buffer( - keyframe_selection='first') - prob = processor.incorporate_detection(this_image, mask, - new_segments_info) - next_voting_frame += args.detection_every - if next_voting_frame >= vid_length: - next_voting_frame = vid_length + args.num_voting_frames - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 + segments_info = None + if not global_json_enabled: + # safety check + json_path = info.get('json') + if per_vid_json_enabled is None: + if json_path is None: + print('Neither global nor per-video json exist.') + per_vid_json_enabled = False + else: + print('Using per-video json.') + per_vid_json_enabled = True + elif json_path is None and per_vid_json_enabled: + raise RuntimeError( + f'Per-video json is enabled but not found for {vid_name}.') + + # read the per-video pred.json + if per_vid_json_enabled: + with open(json_path[0], 'r') as f: + segments_info = json.load(f) + processor.enabled_long_id() + else: + # read from the global json + segments_info = video_id_to_annotation[vid_name][ti]['segments_info'] + processor.enabled_long_id() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + segments_info = convert_json_dict_to_objects_info(mask, + segments_info, + dataset=dataset_name) + frame_info = FrameInfo(image, mask, segments_info, ti, info) + + if temporal_setting == 'semionline': + if ti + args.num_voting_frames > next_voting_frame: + # wait for more frames before proceeding + processor.add_to_temporary_buffer(frame_info) + + if ti == next_voting_frame: + # process this clip + this_image = processor.frame_buffer[0].image + this_ti = processor.frame_buffer[0].ti + this_frame_name = processor.frame_buffer[0].name + save_this_frame = processor.frame_buffer[0].save_needed + path_to_image = processor.frame_buffer[0].path_to_image + + _, mask, new_segments_info = processor.vote_in_temporary_buffer( + keyframe_selection='first') + prob = processor.incorporate_detection( + this_image, mask, new_segments_info) + next_voting_frame += 
+                            if next_voting_frame >= vid_length:
+                                next_voting_frame = vid_length + args.num_voting_frames
-                        if save_this_frame:
-                            result_saver.save_mask(
-                                prob,
-                                this_frame_name,
-                                need_resize=need_resize,
-                                shape=shape,
-                                path_to_image=path_to_image,
-                            )
-
-                        for frame_info in processor.frame_buffer[1:]:
-                            this_image = frame_info.image
-                            this_ti = frame_info.ti
-                            this_frame_name = frame_info.name
-                            save_this_frame = frame_info.save_needed
-                            path_to_image = frame_info.path_to_image
-                            start = torch.cuda.Event(enable_timing=True)
-                            end = torch.cuda.Event(enable_timing=True)
-                            start.record()
-                            prob = processor.step(this_image,
-                                                  None,
-                                                  None,
-                                                  end=(this_ti == vid_length - 1))
-                            end.record()
-                            torch.cuda.synchronize()
-                            total_process_time += (start.elapsed_time(end) / 1000)
-                            total_frames += 1
-
-                            if save_this_frame:
-                                result_saver.save_mask(prob,
-                                                       this_frame_name,
-                                                       need_resize=need_resize,
-                                                       shape=shape,
-                                                       path_to_image=path_to_image)
-
-                        processor.clear_buffer()
-                    else:
-                        # standard propagation
-                        prob = processor.step(image, None, None, end=(ti == vid_length - 1))
+                            end.record()
+                            torch.cuda.synchronize()
+                            total_process_time += (start.elapsed_time(end) / 1000)
+                            total_frames += 1
+
+                            if save_this_frame:
+                                result_saver.save_mask(
+                                    prob,
+                                    this_frame_name,
+                                    need_resize=need_resize,
+                                    shape=shape,
+                                    path_to_image=path_to_image,
+                                )
+
+                            for frame_info in processor.frame_buffer[1:]:
+                                this_image = frame_info.image
+                                this_ti = frame_info.ti
+                                this_frame_name = frame_info.name
+                                save_this_frame = frame_info.save_needed
+                                path_to_image = frame_info.path_to_image
+                                start = torch.cuda.Event(enable_timing=True)
+                                end = torch.cuda.Event(enable_timing=True)
+                                start.record()
+                                prob = processor.step(this_image,
+                                                      None,
+                                                      None,
+                                                      end=(this_ti == vid_length - 1))
+                                end.record()
+                                torch.cuda.synchronize()
+                                total_process_time += (start.elapsed_time(end) / 1000)
+                                total_frames += 1
+
+                                if save_this_frame:
+                                    result_saver.save_mask(prob,
+                                                           this_frame_name,
+                                                           need_resize=need_resize,
+                                                           shape=shape,
+                                                           path_to_image=path_to_image)
+
+                            processor.clear_buffer()
+                        else:
+                            # standard propagation
+                            prob = processor.step(image, None, None, end=(ti == vid_length - 1))
+                            end.record()
+                            torch.cuda.synchronize()
+                            total_process_time += (start.elapsed_time(end) / 1000)
+                            total_frames += 1
+                            if info['save'][0]:
+                                result_saver.save_mask(prob,
+                                                       frame,
+                                                       need_resize=need_resize,
+                                                       shape=shape,
+                                                       path_to_image=path_to_image)
+
+                    elif temporal_setting == 'online':
+                        if ti % args.detection_every == 0:
+                            # incorporate new detections
+                            assert mask is not None
+                            prob = processor.incorporate_detection(image, mask, segments_info)
+                        else:
+                            # Run the model on this frame
+                            prob = processor.step(image, None, None, end=(ti == vid_length - 1))
                        end.record()
                        torch.cuda.synchronize()
                        total_process_time += (start.elapsed_time(end) / 1000)
@@ -275,72 +296,57 @@
                                           shape=shape,
                                           path_to_image=path_to_image)
-                elif temporal_setting == 'online':
-                    if ti % args.detection_every == 0:
-                        # incorporate new detections
-                        assert mask is not None
-                        prob = processor.incorporate_detection(image, mask, segments_info)
-                    else:
-                        # Run the model on this frame
-                        prob = processor.step(image, None, None, end=(ti == vid_length - 1))
-                    end.record()
-                    torch.cuda.synchronize()
-                    total_process_time += (start.elapsed_time(end) / 1000)
-                    total_frames += 1
-                    if info['save'][0]:
-                        result_saver.save_mask(prob,
-                                               frame,
-                                               need_resize=need_resize,
-                                               shape=shape,
-                                               path_to_image=path_to_image)
-
-                else:
-                    raise NotImplementedError
-
-        result_saver.end()
-        if is_vipseg:
-            # save this for a dataset-level json
-            output_json_annotations.append(result_saver.video_json)
-        elif is_burst:
-            # save this as a video-level json, which we merge later
-            with open(path.join(out_path, vid_name, 'pred.json'), 'w') as f:
-                json.dump(result_saver.video_json, f)
-        elif is_demo:
-            # save this as a video-level json in a separate folder
-            os.makedirs(path.join(out_path, 'JSONFiles'), exist_ok=True)
-            with open(path.join(out_path, 'JSONFiles', f'{vid_name}.json'), 'w') as f:
-                json.dump(result_saver.video_json, f, indent=4)
-
-    except Exception as e:
-        print(f'Runtime error at {vid_name}')
-        print(e)
-        raise e  # comment this out if you want
-
-if is_vipseg:
-    output_json = {'annotations': output_json_annotations}
-    with open(path.join(out_path, 'pred.json'), 'w') as f:
-        json.dump(output_json, f)
-
-print(f'Total processing time: {total_process_time}')
-print(f'Total processed frames: {total_frames}')
-print(f'FPS: {total_frames / total_process_time}')
-print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
-
-if is_vipseg:
-    vipseg_root = args.vipseg_root
-    print('Starting evaluation...')
-    merge_stuff(out_path, out_path)
-
-    if not args.no_metrics:
-        p1 = Process(target=partial(eval_stq, out_path, f'{vipseg_root}/panomasksRGB',
-                                    f'{vipseg_root}/panoptic_gt_VIPSeg_val.json'))
-        p1.start()
-        eval_vpq(out_path,
-                 f'{vipseg_root}/panomasksRGB',
-                 f'{vipseg_root}/panoptic_gt_VIPSeg_val.json',
-                 num_processes=16)
-        p1.join()
-elif is_davis:
-    if args.postprocess_limit_max_id > 0:
-        print('Post-processing DAVIS 2017...')
-        limit_max_id(out_path, out_path, max_num_objects=args.postprocess_limit_max_id)
+                    else:
+                        raise NotImplementedError
+
+            result_saver.end()
+            if is_vipseg:
+                # save this for a dataset-level json
+                output_json_annotations.append(result_saver.video_json)
+            elif is_burst:
+                # save this as a video-level json, which we merge later
+                with open(path.join(out_path, vid_name, 'pred.json'), 'w') as f:
+                    json.dump(result_saver.video_json, f)
+            elif is_demo:
+                # save this as a video-level json in a separate folder
+                os.makedirs(path.join(out_path, 'JSONFiles'), exist_ok=True)
+                with open(path.join(out_path, 'JSONFiles', f'{vid_name}.json'), 'w') as f:
+                    json.dump(result_saver.video_json, f, indent=4)
+
+        except Exception as e:
+            print(f'Runtime error at {vid_name}')
+            print(e)
+            raise e  # comment this out if you want
+
+    if is_vipseg:
+        output_json = {'annotations': output_json_annotations}
+        with open(path.join(out_path, 'pred.json'), 'w') as f:
+            json.dump(output_json, f)
+
+    print(f'Total processing time: {total_process_time}')
+    print(f'Total processed frames: {total_frames}')
+    print(f'FPS: {total_frames / total_process_time}')
+    print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
+
+    if is_vipseg:
+        vipseg_root = args.vipseg_root
+        print('Starting evaluation...')
+        merge_stuff(out_path, out_path)
+
+        if not args.no_metrics:
+            p1 = Process(target=partial(eval_stq, out_path, f'{vipseg_root}/panomasksRGB',
+                                        f'{vipseg_root}/panoptic_gt_VIPSeg_val.json'))
+            p1.start()
+            eval_vpq(out_path,
+                     f'{vipseg_root}/panomasksRGB',
+                     f'{vipseg_root}/panoptic_gt_VIPSeg_val.json',
+                     num_processes=16)
+            p1.join()
+    elif is_davis:
+        if args.postprocess_limit_max_id > 0:
+            print('Post-processing DAVIS 2017...')
+            limit_max_id(out_path, out_path, max_num_objects=args.postprocess_limit_max_id)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/merge_multi_scale.py b/scripts/merge_multi_scale.py
index de5def7..a3fc9c0 100644
--- a/scripts/merge_multi_scale.py
+++ b/scripts/merge_multi_scale.py
@@ -8,7 +8,7 @@ import hickle as hkl
 
 from PIL import Image, ImagePalette
 
-from progressbar import progressbar
+from tqdm import tqdm
 from multiprocessing import Pool
 
 from deva.utils import palette
@@ -123,7 +123,7 @@ def process_vid(vid):
     print('Total number of videos: ', len(all_vid))
 
     pool = Pool(processes=args.num_proc)
-    for _ in progressbar(pool.imap_unordered(process_vid, all_vid), max_value=len(all_vid)):
+    for _ in tqdm(pool.imap_unordered(process_vid, all_vid), total=len(all_vid)):
         pass
 
     pool.close()
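Note on the progressbar-to-tqdm migration in the hunks above: tqdm reports the expected iteration count through the `total=` keyword rather than progressbar's `max_value=`, which is why the new loops pass `total=len(...)`. The snippet below is a minimal, self-contained sketch of the same Pool-plus-progress-bar pattern; `process_item` and the item list are placeholders for illustration, not code from this repository.

# Minimal sketch of the pool + tqdm progress pattern (stand-in worker, not the
# repository's process_vid). tqdm takes `total=`, not progressbar's `max_value=`.
from multiprocessing import Pool

from tqdm import tqdm


def process_item(x):
    # stand-in worker; returns its input unchanged
    return x


if __name__ == '__main__':
    items = list(range(100))
    with Pool(processes=4) as pool:
        # imap_unordered yields results lazily, so the bar length must be given explicitly
        for _ in tqdm(pool.imap_unordered(process_item, items), total=len(items)):
            pass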