diff --git a/deva/vps_metrics/stuff_merging.py b/deva/vps_metrics/stuff_merging.py
index 58153bb..cba4578 100644
--- a/deva/vps_metrics/stuff_merging.py
+++ b/deva/vps_metrics/stuff_merging.py
@@ -6,7 +6,7 @@
 import numpy as np
 from PIL import Image
 from functools import partial
-from progressbar import progressbar
+from tqdm import tqdm
 
 from deva.utils.vipseg_categories import VIPSEG_CATEGORIES
 from deva.utils.pano_utils import IDPostprocessor, id_to_rgb
@@ -94,7 +94,7 @@ def merge_stuff(input_path, output_path):
     output_annotations = []
 
     pool = Pool(16)
-    for out_vid_ann in progressbar(pool.imap(
+    for out_vid_ann in tqdm(pool.imap(
             partial(process_single_video, input_path=input_path, output_path=output_path),
             annotations),
-                                   max_value=len(annotations)):
+                            total=len(annotations)):
diff --git a/evaluation/eval_ref_davis.py b/evaluation/eval_ref_davis.py
index 0596e00..7a3ceb3 100644
--- a/evaluation/eval_ref_davis.py
+++ b/evaluation/eval_ref_davis.py
@@ -13,151 +13,160 @@
 from deva.utils.palette import davis_palette
 from deva.inference.result_utils import ResultSaver
 from deva.inference.eval_args import add_common_eval_args, get_model_and_config
-"""
-Arguments loading
-"""
-parser = ArgumentParser()
-parser.add_argument('--img_path', default='../DAVIS/2017/trainval/JPEGImages/480p')
-parser.add_argument('--mask_path')
-parser.add_argument('--num_voting_frames',
-                    default=5,
-                    type=int,
-                    help='Number of frames selected for the initial consensus voting')
-add_common_eval_args(parser)
-network, config, args = get_model_and_config(parser)
-"""
-Data preparation
-"""
-out_path = args.output
-meta_dataset = ReferringDAVISTestDataset(args.img_path, args.mask_path)
-torch.autograd.set_grad_enabled(False)
-
-videos = meta_dataset.get_videos()
-
-total_process_time = 0
-total_frames = 0
-
-# Start eval
-pbar = tqdm(videos, total=len(videos))
-for vid_name in pbar:
-    pbar.set_description(vid_name)
-    video_scores = meta_dataset.get_scores(vid_name)
-    try:
-        """
-        initial pass, perform consensus voting and get a keyframe
-        """
-        image_feature_store = ImageFeatureStore(network)
-        vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, config['num_voting_frames'])
-        loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
-
-        time_indices = []
-        images = []
-        masks = []
-        scores = []
-        for ti, data in enumerate(loader):
-            time_indices.append(data['info']['time_index'][0].item())
-            image = data['rgb'].cuda()[0]
-            mask = data['mask'].cuda()[0]
-            images.append(image)
-            masks.append(mask)
-
-            frame_name = data['info']['frame'][0][:-4]
-            scores.append(video_scores[frame_name])
-
-        torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        keyframe_ti, projected_mask = find_consensus_with_established_association(
-            time_indices,
-            images,
-            masks,
-            scores=scores,
-            network=network,
-            store=image_feature_store,
-            config=config)
-        end.record()
-        torch.cuda.synchronize()
-        total_process_time += (start.elapsed_time(end) / 1000)
-        """
-        Backward pass video reader
-        """
-        backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name,
-                                                                    start=-1,
-                                                                    end=keyframe_ti + 1,
-                                                                    reverse=True)
-        """
-        Forward pass video reader
-        """
-        forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name,
-                                                                   start=keyframe_ti,
-                                                                   end=-1,
-                                                                   reverse=False)
-        """
-        Running them in combination
-        """
-        vid_readers = [backward_vid_reader, forward_vid_reader]
-        for vid_reader in vid_readers:
-            loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
-            vid_length = len(loader)
-            # no need to count usage for LT if the video is not that long anyway
-            config['enable_long_term_count_usage'] = (
-                config['enable_long_term']
-                and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) *
                     config['num_prototypes']) >= config['max_long_term_elements'])
-            processor = DEVAInferenceCore(network,
-                                          config=config,
-                                          image_feature_store=image_feature_store)
-            result_saver = ResultSaver(out_path,
-                                       vid_name,
-                                       dataset='ref_davis',
-                                       palette=davis_palette,
-                                       object_manager=processor.object_manager)
+def main():
+    """
+    Arguments loading
+    """
+    parser = ArgumentParser()
+    parser.add_argument('--img_path', default='../DAVIS/2017/trainval/JPEGImages/480p')
+    parser.add_argument('--mask_path')
+    parser.add_argument('--num_voting_frames',
+                        default=5,
+                        type=int,
+                        help='Number of frames selected for the initial consensus voting')
+    add_common_eval_args(parser)
+    network, config, args = get_model_and_config(parser)
+    """
+    Data preparation
+    """
+    out_path = args.output
+    meta_dataset = ReferringDAVISTestDataset(args.img_path, args.mask_path)
+    torch.autograd.set_grad_enabled(False)
+
+    videos = meta_dataset.get_videos()
+
+    total_process_time = 0
+    total_frames = 0
+
+    # Start eval
+    pbar = tqdm(videos, total=len(videos))
+    for vid_name in pbar:
+        pbar.set_description(vid_name)
+        video_scores = meta_dataset.get_scores(vid_name)
+        try:
+            """
+            initial pass, perform consensus voting and get a keyframe
+            """
+            image_feature_store = ImageFeatureStore(network)
+            vid_reader = meta_dataset.get_offline_sampled_frames(vid_name,
+                                                                 config['num_voting_frames'])
+            loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2)
+            time_indices = []
+            images = []
+            masks = []
+            scores = []
             for ti, data in enumerate(loader):
-                with torch.cuda.amp.autocast(enabled=args.amp):
-                    image = data['rgb'].cuda()[0]
-                    info = data['info']
-                    frame = info['frame'][0]
-                    shape = info['shape']
-                    need_resize = info['need_resize'][0]
-                    image_ti = info['time_index'][0].item()
-
-                    if image_ti == keyframe_ti:
-                        mask = projected_mask
-                    else:
-                        mask = None
-
-                    start = torch.cuda.Event(enable_timing=True)
-                    end = torch.cuda.Event(enable_timing=True)
-                    start.record()
-
-                    # Run the model on this frame
-                    prob = processor.step(image,
-                                          mask,
-                                          end=(ti == vid_length - 1),
-                                          hard_mask=False,
-                                          image_ti_override=image_ti)
-
-                    end.record()
-                    torch.cuda.synchronize()
-                    total_process_time += (start.elapsed_time(end) / 1000)
-                    total_frames += 1
-
-                    result_saver.save_mask(prob, frame, need_resize=need_resize, shape=shape)
-
-        result_saver.end()
-        with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f:
-            f.write(f'options: {time_indices}; keyframe: {keyframe_ti}')
-
-    except Exception as e:
-        print(f'Runtime error at {vid_name}')
-        print(e)
-        raise e
-
-print(f'Total processing time: {total_process_time}')
-print(f'Total processed frames: {total_frames}')
-print(f'FPS: {total_frames / total_process_time}')
-print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
+                time_indices.append(data['info']['time_index'][0].item())
+                image = data['rgb'].cuda()[0]
+                mask = data['mask'].cuda()[0]
+                images.append(image)
+                masks.append(mask)
+
+                frame_name = data['info']['frame'][0][:-4]
+                scores.append(video_scores[frame_name])
+
+            torch.cuda.synchronize()
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            keyframe_ti, projected_mask = find_consensus_with_established_association(
+ time_indices, + images, + masks, + scores=scores, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] and + (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + result_saver = ResultSaver(out_path, + vid_name, + dataset='ref_davis', + palette=davis_palette, + object_manager=processor.object_manager) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti) + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + result_saver.save_mask(prob, frame, need_resize=need_resize, shape=shape) + + result_saver.end() + with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_ref_youtubevos.py b/evaluation/eval_ref_youtubevos.py index 2a0661c..491fa66 100644 --- a/evaluation/eval_ref_youtubevos.py +++ b/evaluation/eval_ref_youtubevos.py @@ -17,177 +17,185 @@ from deva.inference.consensus_associated import find_consensus_with_established_association from deva.utils.load_subset import load_referring_yv_val from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--img_path', default='../YouTube/all_frames/valid_all_frames/JPEGImages') -parser.add_argument('--mask_path') -parser.add_argument('--json_path', - default='../YouTube/meta_expressions/valid/meta_expressions.json') -parser.add_argument('--num_voting_frames', - default=10, - type=int, - help='Number of frames selected for the initial consensus voting') 
-add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Data preparation -""" -out_path = args.output -meta_dataset = ReferringYouTubeVOSTestDataset(args.img_path, args.mask_path, args.json_path) -torch.autograd.set_grad_enabled(False) - -videos = meta_dataset.get_videos() -video_subset = load_referring_yv_val() -print(f'Subset size: {len(video_subset)}') - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(video_subset) -for vid_name in pbar: - pbar.set_description(vid_name) - objects = meta_dataset.get_objects(vid_name) - video_scores = meta_dataset.get_scores(vid_name) - image_feature_store = ImageFeatureStore(network, no_warning=True) - for object_name in objects: - try: - """ - initial pass, perform consensus voting and get a keyframe - """ - object_scores = video_scores[object_name] - vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, object_name, - config['num_voting_frames']) - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - - time_indices = [] - images = [] - masks = [] - scores = [] - for ti, data in enumerate(loader): - image_ti = data['info']['time_index'][0].item() - time_indices.append(image_ti) - image = data['rgb'].cuda()[0] - mask = data['mask'].cuda()[0] - images.append(image) - masks.append(mask) - - frame_name = data['info']['frame'][0][:-4] - scores.append(object_scores[frame_name]) - - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - keyframe_ti, projected_mask = find_consensus_with_established_association( - time_indices, - images, - masks, - scores=scores, - network=network, - store=image_feature_store, - config=config) - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - """ - Backward pass video reader - """ - backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - object_name, - start=-1, - end=keyframe_ti + 1, - reverse=True) - """ - Forward pass video reader - """ - forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - object_name, - start=keyframe_ti, - end=-1, - reverse=False) - """ - Running them in combination - """ - vid_readers = [backward_vid_reader, forward_vid_reader] - for vid_reader in vid_readers: - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] and - (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - processor = DEVAInferenceCore(network, - config=config, - image_feature_store=image_feature_store) +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='../YouTube/all_frames/valid_all_frames/JPEGImages') + parser.add_argument('--mask_path') + parser.add_argument('--json_path', + default='../YouTube/meta_expressions/valid/meta_expressions.json') + parser.add_argument('--num_voting_frames', + default=10, + type=int, + help='Number of frames selected for the initial consensus voting') + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Data preparation + """ + out_path = args.output + meta_dataset = ReferringYouTubeVOSTestDataset(args.img_path, args.mask_path, args.json_path) + 
torch.autograd.set_grad_enabled(False) + + videos = meta_dataset.get_videos() + video_subset = load_referring_yv_val() + print(f'Subset size: {len(video_subset)}') + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(video_subset) + for vid_name in pbar: + pbar.set_description(vid_name) + objects = meta_dataset.get_objects(vid_name) + video_scores = meta_dataset.get_scores(vid_name) + image_feature_store = ImageFeatureStore(network, no_warning=True) + for object_name in objects: + try: + """ + initial pass, perform consensus voting and get a keyframe + """ + object_scores = video_scores[object_name] + vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, object_name, + config['num_voting_frames']) + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + time_indices = [] + images = [] + masks = [] + scores = [] for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - image_ti = info['time_index'][0].item() - - if image_ti == keyframe_ti: - mask = projected_mask - else: - mask = None - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # Run the model on this frame - prob = processor.step(image, - mask, - end=(ti == vid_length - 1), - hard_mask=False, - image_ti_override=image_ti, - delete_buffer=False) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - out_mask = (prob[1] > prob[0]).float() * 255 - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - # Save the mask - if args.save_all or info['save'][0]: - this_out_path = path.join(out_path, 'Annotations', vid_name, - object_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - with open(path.join(out_path, 'Annotations', vid_name, object_name, 'key.txt'), - 'w') as f: - f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') - -print('Making zip for YouTubeVOS...') -shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, - 'Annotations') + image_ti = data['info']['time_index'][0].item() + time_indices.append(image_ti) + image = data['rgb'].cuda()[0] + mask = data['mask'].cuda()[0] + images.append(image) + masks.append(mask) + + frame_name = data['info']['frame'][0][:-4] + scores.append(object_scores[frame_name]) + + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + keyframe_ti, projected_mask = find_consensus_with_established_association( + time_indices, + images, + masks, + scores=scores, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + 
backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + object_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + object_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / + (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti, + delete_buffer=False) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + out_mask = (prob[1] > prob[0]).float() * 255 + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + # Save the mask + if args.save_all or info['save'][0]: + this_out_path = path.join(out_path, 'Annotations', vid_name, + object_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + with open(path.join(out_path, 'Annotations', vid_name, object_name, 'key.txt'), + 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + print('Making zip for YouTubeVOS...') + shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, + 'Annotations') + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_saliency.py b/evaluation/eval_saliency.py index 9368506..47da6de 100644 --- a/evaluation/eval_saliency.py +++ b/evaluation/eval_saliency.py @@ -16,151 +16,165 @@ from deva.inference.consensus_associated import find_consensus_with_established_association from deva.utils.tensor_utils import pad_divide_by from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--img_path', default='../DAVIS/2016/JPEGImages/480p') -parser.add_argument('--mask_path') -parser.add_argument('--imset_path') 
-parser.add_argument('--num_voting_frames', - default=10, - type=int, - help='Number of frames selected for the initial consensus voting') -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Data preparation -""" -out_path = args.output -meta_dataset = DAVISSaliencyTestDataset(args.img_path, args.mask_path, imset=args.imset_path) -torch.autograd.set_grad_enabled(False) - -videos = meta_dataset.get_videos() - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(videos, total=len(meta_dataset)) -for vid_name in pbar: - pbar.set_description(vid_name) - - try: - """ - initial pass, perform consensus voting and get a keyframe - """ - image_feature_store = ImageFeatureStore(network) - vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, config['num_voting_frames']) - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - - time_indices = [] - images = [] - masks = [] - for ti, data in enumerate(loader): - time_indices.append(data['info']['time_index'][0].item()) - image = data['rgb'].cuda()[0] - mask = data['mask'].cuda()[0] - image, _ = pad_divide_by(image, 16) - mask, _ = pad_divide_by(mask, 16) - images.append(image) - masks.append(mask) - - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - keyframe_ti, projected_mask = find_consensus_with_established_association( - time_indices, images, masks, network=network, store=image_feature_store, config=config) - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - """ - Backward pass video reader - """ - backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - start=-1, - end=keyframe_ti + 1, - reverse=True) - """ - Forward pass video reader - """ - forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, - start=keyframe_ti, - end=-1, - reverse=False) - """ - Running them in combination - """ - vid_readers = [backward_vid_reader, forward_vid_reader] - for vid_reader in vid_readers: - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - processor = DEVAInferenceCore(network, - config=config, - image_feature_store=image_feature_store) +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='../DAVIS/2016/JPEGImages/480p') + parser.add_argument('--mask_path') + parser.add_argument('--imset_path') + parser.add_argument('--num_voting_frames', + default=10, + type=int, + help='Number of frames selected for the initial consensus voting') + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Data preparation + """ + out_path = args.output + meta_dataset = DAVISSaliencyTestDataset(args.img_path, args.mask_path, imset=args.imset_path) + torch.autograd.set_grad_enabled(False) + + videos = meta_dataset.get_videos() + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(videos, total=len(meta_dataset)) + for vid_name in pbar: + pbar.set_description(vid_name) + + try: + """ + initial pass, perform consensus voting and get a keyframe + """ + 
image_feature_store = ImageFeatureStore(network) + vid_reader = meta_dataset.get_offline_sampled_frames(vid_name, + config['num_voting_frames']) + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + time_indices = [] + images = [] + masks = [] for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - image_ti = info['time_index'][0].item() - - if image_ti == keyframe_ti: - mask = projected_mask - else: - mask = None - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # Run the model on this frame - prob = processor.step(image, - mask, - end=(ti == vid_length - 1), - hard_mask=False, - image_ti_override=image_ti) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - out_mask = (prob[1] > prob[0]).float() * 255 - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - # Save the mask - this_out_path = path.join(out_path, vid_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: - f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + time_indices.append(data['info']['time_index'][0].item()) + image = data['rgb'].cuda()[0] + mask = data['mask'].cuda()[0] + image, _ = pad_divide_by(image, 16) + mask, _ = pad_divide_by(mask, 16) + images.append(image) + masks.append(mask) + + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + keyframe_ti, projected_mask = find_consensus_with_established_association( + time_indices, + images, + masks, + network=network, + store=image_feature_store, + config=config) + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + """ + Backward pass video reader + """ + backward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=-1, + end=keyframe_ti + 1, + reverse=True) + """ + Forward pass video reader + """ + forward_vid_reader = meta_dataset.get_partial_video_loader(vid_name, + start=keyframe_ti, + end=-1, + reverse=False) + """ + Running them in combination + """ + vid_readers = [backward_vid_reader, forward_vid_reader] + for vid_reader in vid_readers: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] and + (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + processor = DEVAInferenceCore(network, + config=config, + image_feature_store=image_feature_store) + + for ti, data 
in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + image_ti = info['time_index'][0].item() + + if image_ti == keyframe_ti: + mask = projected_mask + else: + mask = None + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # Run the model on this frame + prob = processor.step(image, + mask, + end=(ti == vid_length - 1), + hard_mask=False, + image_ti_override=image_ti) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + out_mask = (prob[1] > prob[0]).float() * 255 + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + # Save the mask + this_out_path = path.join(out_path, vid_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + with open(path.join(out_path, vid_name, 'key.txt'), 'w') as f: + f.write(f'options: {time_indices}; keyframe: {keyframe_ti}') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_vos.py b/evaluation/eval_vos.py index 3db6714..f6f03d8 100644 --- a/evaluation/eval_vos.py +++ b/evaluation/eval_vos.py @@ -18,210 +18,217 @@ from deva.inference.data.vos_test_datasets import GeneralVOSTestDataset, DAVISTestDataset, YouTubeVOSTestDataset from deva.inference.inference_core import DEVAInferenceCore from deva.inference.eval_args import add_common_eval_args, get_model_and_config -""" -Arguments loading -""" -parser = ArgumentParser() -parser.add_argument('--d16_path', default='../DAVIS/2016') -parser.add_argument('--d17_path', default='../DAVIS/2017') -parser.add_argument('--y18_path', default='../YouTube2018') -parser.add_argument('--y19_path', default='../YouTube') -# For generic (G) evaluation, point to a folder that contains "JPEGImages" and "Annotations" -parser.add_argument('--generic_path', default='./example/vos') - -parser.add_argument('--dataset', help='D16/D17/Y18/Y19/G', default='D17') -parser.add_argument('--split', help='val/test', default='val') -parser.add_argument('--use_all_masks', - help='Use all masks in the mask folder for generic evaluation. ' - 'Forced to be True for YouTubeVOS; forced to be False for DAVIS/MOSE.', - action='store_true') - -# Multi-scale options -parser.add_argument('--save_scores', action='store_true') -parser.add_argument('--flip', action='store_true') - -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -args.dataset = args.dataset.upper() - -if args.output is None: - args.output = f'../output/{args.dataset}_{args.split}' - print(f'Output path not provided. 
Defaulting to {args.output}') -""" -Data preparation -""" -is_youtube = args.dataset.startswith('Y') -is_davis = args.dataset.startswith('D') - -if is_youtube or args.save_scores: - out_path = path.join(args.output, 'Annotations') -else: - out_path = args.output - -if is_youtube: - if args.dataset == 'Y18': - yv_path = args.y18_path - elif args.dataset == 'Y19': - yv_path = args.y19_path - - if args.split == 'val': - args.split = 'valid' - meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='valid', size=args.size) - elif args.split == 'test': - meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='test', size=args.size) + + +def main(): + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--d16_path', default='../DAVIS/2016') + parser.add_argument('--d17_path', default='../DAVIS/2017') + parser.add_argument('--y18_path', default='../YouTube2018') + parser.add_argument('--y19_path', default='../YouTube') + # For generic (G) evaluation, point to a folder that contains "JPEGImages" and "Annotations" + parser.add_argument('--generic_path', default='./example/vos') + + parser.add_argument('--dataset', help='D16/D17/Y18/Y19/G', default='D17') + parser.add_argument('--split', help='val/test', default='val') + parser.add_argument('--use_all_masks', + help='Use all masks in the mask folder for generic evaluation. ' + 'Forced to be True for YouTubeVOS; forced to be False for DAVIS/MOSE.', + action='store_true') + + # Multi-scale options + parser.add_argument('--save_scores', action='store_true') + parser.add_argument('--flip', action='store_true') + + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + args.dataset = args.dataset.upper() + + if args.output is None: + args.output = f'../output/{args.dataset}_{args.split}' + print(f'Output path not provided. 
Defaulting to {args.output}') + """ + Data preparation + """ + is_youtube = args.dataset.startswith('Y') + is_davis = args.dataset.startswith('D') + + if is_youtube or args.save_scores: + out_path = path.join(args.output, 'Annotations') else: - raise NotImplementedError + out_path = args.output + + if is_youtube: + if args.dataset == 'Y18': + yv_path = args.y18_path + elif args.dataset == 'Y19': + yv_path = args.y19_path -elif is_davis: - if args.dataset == 'D16': - if args.split == 'val': - # Set up Dataset, a small hack to use the image set in the 2017 folder because the 2016 one is of a different format - meta_dataset = DAVISTestDataset(args.d16_path, - imset='../../2017/trainval/ImageSets/2016/val.txt', - size=args.size) - else: - raise NotImplementedError - palette = None - elif args.dataset == 'D17': if args.split == 'val': - meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'trainval'), - imset='2017/val.txt', - size=args.size) + args.split = 'valid' + meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='valid', size=args.size) elif args.split == 'test': - meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'test-dev'), - imset='2017/test-dev.txt', - size=args.size) + meta_dataset = YouTubeVOSTestDataset(data_root=yv_path, split='test', size=args.size) else: raise NotImplementedError -elif args.dataset == 'G': - meta_dataset = GeneralVOSTestDataset(path.join(args.generic_path), - size=args.size, - use_all_masks=args.use_all_masks) - - if not args.save_all: - args.save_all = True - print('save_all is forced to be true in generic evaluation mode.') -else: - raise NotImplementedError - -torch.autograd.set_grad_enabled(False) - -# Set up loader -meta_loader = meta_dataset.get_datasets() - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(meta_loader, total=len(meta_dataset)) -for vid_reader in pbar: - - loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_name = vid_reader.vid_name - pbar.set_description(vid_name) - vid_length = len(loader) - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - - try: - processor = DEVAInferenceCore(network, config=config) - first_mask_loaded = False - - for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - mask = data.get('mask') - if mask is not None: - mask = mask.cuda()[0] - valid_labels = data.get('valid_labels') - if valid_labels is not None: - valid_labels = valid_labels.tolist()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - # if, for some reason, the first frame is not aligned with the first mask - if not first_mask_loaded: + + elif is_davis: + if args.dataset == 'D16': + if args.split == 'val': + # Set up Dataset, a small hack to use the image set in the 2017 folder because the 2016 one is of a different format + meta_dataset = DAVISTestDataset(args.d16_path, + imset='../../2017/trainval/ImageSets/2016/val.txt', + size=args.size) + else: + raise NotImplementedError + palette = None + elif args.dataset == 'D17': + if args.split == 'val': + meta_dataset = 
DAVISTestDataset(path.join(args.d17_path, 'trainval'), + imset='2017/val.txt', + size=args.size) + elif args.split == 'test': + meta_dataset = DAVISTestDataset(path.join(args.d17_path, 'test-dev'), + imset='2017/test-dev.txt', + size=args.size) + else: + raise NotImplementedError + elif args.dataset == 'G': + meta_dataset = GeneralVOSTestDataset(path.join(args.generic_path), + size=args.size, + use_all_masks=args.use_all_masks) + + if not args.save_all: + args.save_all = True + print('save_all is forced to be true in generic evaluation mode.') + else: + raise NotImplementedError + + torch.autograd.set_grad_enabled(False) + + # Set up loader + meta_loader = meta_dataset.get_datasets() + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(meta_loader, total=len(meta_dataset)) + for vid_reader in pbar: + + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_name = vid_reader.vid_name + pbar.set_description(vid_name) + vid_length = len(loader) + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + try: + processor = DEVAInferenceCore(network, config=config) + first_mask_loaded = False + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + image = data['rgb'].cuda()[0] + mask = data.get('mask') if mask is not None: - first_mask_loaded = True - else: - # no point to do anything without a mask - continue - - if args.flip: - image = torch.flip(image, dims=[-1]) - mask = torch.flip(mask, dims=[-1]) if mask is not None else None - - # Run the model on this frame - prob = processor.step(image, mask, valid_labels, end=(ti == vid_length - 1)) - - # Upsample to original size if needed - if need_resize: - prob = F.interpolate(prob.unsqueeze(1), - shape, - mode='bilinear', - align_corners=False)[:, 0] - - if args.flip: - prob = torch.flip(prob, dims=[-1]) - - # Probability mask -> index mask - out_mask = torch.argmax(prob, dim=0) - out_mask = processor.object_manager.tmp_to_obj_cls(out_mask) - - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 - - if args.save_scores: - prob = (prob.detach().cpu().numpy() * 255).astype(np.uint8) - - # Save the mask - if args.save_all or info['save'][0]: - this_out_path = path.join(out_path, vid_name) - os.makedirs(this_out_path, exist_ok=True) - out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) - if vid_reader.get_palette() is not None: - out_img.putpalette(vid_reader.get_palette()) - out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) - - if args.save_scores: - np_path = path.join(args.output, 'Scores', vid_name) - os.makedirs(np_path, exist_ok=True) - if ti == len(loader) - 1: - hkl.dump(processor.object_manager.get_tmp_to_obj_mapping(), - path.join(np_path, f'backward.hkl'), - mode='w') + mask = mask.cuda()[0] + valid_labels = data.get('valid_labels') + if valid_labels is not None: + valid_labels = valid_labels.tolist()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + # if, for some reason, the first frame is not aligned with the first mask + if not first_mask_loaded: + 
if mask is not None: + first_mask_loaded = True + else: + # no point to do anything without a mask + continue + + if args.flip: + image = torch.flip(image, dims=[-1]) + mask = torch.flip(mask, dims=[-1]) if mask is not None else None + + # Run the model on this frame + prob = processor.step(image, mask, valid_labels, end=(ti == vid_length - 1)) + + # Upsample to original size if needed + if need_resize: + prob = F.interpolate(prob.unsqueeze(1), + shape, + mode='bilinear', + align_corners=False)[:, 0] + + if args.flip: + prob = torch.flip(prob, dims=[-1]) + + # Probability mask -> index mask + out_mask = torch.argmax(prob, dim=0) + out_mask = processor.object_manager.tmp_to_obj_cls(out_mask) + + end.record() + torch.cuda.synchronize() + total_process_time += (start.elapsed_time(end) / 1000) + total_frames += 1 + + if args.save_scores: + prob = (prob.detach().cpu().numpy() * 255).astype(np.uint8) + + # Save the mask if args.save_all or info['save'][0]: - hkl.dump(prob, - path.join(np_path, f'{frame[:-4]}.hkl'), - mode='w', - compression='lzf') - - except Exception as e: - print(f'Runtime error at {vid_name}') - print(e) - raise e - -print(f'Total processing time: {total_process_time}') -print(f'Total processed frames: {total_frames}') -print(f'FPS: {total_frames / total_process_time}') -print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') - -if not args.save_scores: - if is_youtube: - print('Making zip for YouTubeVOS...') - shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', args.output, - 'Annotations') - elif is_davis and args.split == 'test': - print('Making zip for DAVIS test-dev...') - shutil.make_archive(args.output, 'zip', args.output) + this_out_path = path.join(out_path, vid_name) + os.makedirs(this_out_path, exist_ok=True) + out_img = Image.fromarray(out_mask.cpu().numpy().astype(np.uint8)) + if vid_reader.get_palette() is not None: + out_img.putpalette(vid_reader.get_palette()) + out_img.save(os.path.join(this_out_path, frame[:-4] + '.png')) + + if args.save_scores: + np_path = path.join(args.output, 'Scores', vid_name) + os.makedirs(np_path, exist_ok=True) + if ti == len(loader) - 1: + hkl.dump(processor.object_manager.get_tmp_to_obj_mapping(), + path.join(np_path, f'backward.hkl'), + mode='w') + if args.save_all or info['save'][0]: + hkl.dump(prob, + path.join(np_path, f'{frame[:-4]}.hkl'), + mode='w', + compression='lzf') + + except Exception as e: + print(f'Runtime error at {vid_name}') + print(e) + raise e + + print(f'Total processing time: {total_process_time}') + print(f'Total processed frames: {total_frames}') + print(f'FPS: {total_frames / total_process_time}') + print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}') + + if not args.save_scores: + if is_youtube: + print('Making zip for YouTubeVOS...') + shutil.make_archive(path.join(args.output, path.basename(args.output)), 'zip', + args.output, 'Annotations') + elif is_davis and args.split == 'test': + print('Making zip for DAVIS test-dev...') + shutil.make_archive(args.output, 'zip', args.output) + + +if __name__ == '__main__': + main() diff --git a/evaluation/eval_with_detections.py b/evaluation/eval_with_detections.py index e68e7d4..3c8f3e4 100644 --- a/evaluation/eval_with_detections.py +++ b/evaluation/eval_with_detections.py @@ -23,247 +23,268 @@ from deva.vps_metrics.eval_vpq_vipseg import eval_vpq from deva.inference.postprocess_unsup_davis17 import limit_max_id -# for id2rgb -np.random.seed(42) -""" -Arguments loading -""" 
-parser = ArgumentParser() -parser.add_argument('--img_path', default='./example/vipseg') -parser.add_argument('--mask_path') -parser.add_argument('--json_path', default=None) -parser.add_argument('--detection_every', type=int, default=5) -parser.add_argument('--num_voting_frames', - default=3, - type=int, - help='Number of frames selected for voting. only valid in semionline') -parser.add_argument('--dataset', default='vipseg', help='vipseg/burst/unsup_davis17/demo') -parser.add_argument('--max_missed_detection_count', type=int, default=5) -# skip VPQ/STQ computation -parser.add_argument('--no_metrics', action='store_true') - -parser.add_argument('--temporal_setting', default='semionline', help='semionline/online') -parser.add_argument('--max_num_objects', - default=-1, - type=int, - help='Max. num of objects to keep in memory. -1 for no limit') - -# the options below are only valid for burst -parser.add_argument('--start', type=int, default=None, help='for distributed testing') -parser.add_argument('--count', type=int, default=None, help='for distributed testing') -parser.add_argument('--burst_gt_json', default='../BURST/val/all_classes.json') - -# only valid for VIPSeg -parser.add_argument('--vipseg_root', default='../VIPSeg/VIPSeg_720P') - -# this option is only valid for unsup_davis17; limit the maximum number of predicted objects -parser.add_argument('--postprocess_limit_max_id', type=int, default=20) - -add_common_eval_args(parser) -network, config, args = get_model_and_config(parser) -""" -Temporal setting -""" -temporal_setting = args.temporal_setting.lower() -assert temporal_setting in ['semionline', 'online'] -""" -Data preparation -""" -dataset_name = args.dataset.lower() -assert dataset_name in ['vipseg', 'burst', 'unsup_davis17', - 'demo'], f'Unknown dataset {dataset_name}' -print(f'Dataset: {dataset_name}') -is_vipseg = (dataset_name == 'vipseg') -is_burst = (dataset_name == 'burst') -is_davis = (dataset_name == 'unsup_davis17') -is_demo = (dataset_name == 'demo') - -# try to find json path is not given -if args.json_path is None: - if path.exists(path.join(args.mask_path, 'pred.json')): - args.json_path = path.join(args.mask_path, 'pred.json') -out_path = args.output - -# try to find the real mask path if it is hidden behind pan_pred -if path.exists(path.join(args.mask_path, 'pan_pred')): - args.mask_path = path.join(args.mask_path, 'pan_pred') -if is_vipseg or is_davis or is_demo: - meta_dataset = VIPSegDetectionTestDataset(args.img_path, args.mask_path, args.size) -elif is_burst: - meta_dataset = BURSTDetectionTestDataset(args.img_path, - args.mask_path, - args.burst_gt_json, - args.size, - start=args.start, - count=args.count) -else: - raise NotImplementedError - -torch.autograd.set_grad_enabled(False) - -# Set up loader -meta_loader = meta_dataset.get_datasets() -""" -Read the global pred.json if any -""" -global_json_enabled = args.json_path is not None -per_vid_json_enabled = None -if global_json_enabled: - print(f'Using a global json file {args.json_path}') - with open(args.json_path, 'r') as f: - all_json_info = json.load(f) - all_json_info = all_json_info['annotations'] - - video_id_to_annotation = {} - for ann in all_json_info: - video_id_to_annotation[ann['video_id']] = ann['annotations'] - -if is_vipseg: - # we will export this as a single json for VPQ/STQ evaluation - output_json_annotations = [] - -total_process_time = 0 -total_frames = 0 - -# Start eval -pbar = tqdm(meta_loader, total=len(meta_dataset)) -for vid_reader in pbar: - loader = 
DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) - vid_name = vid_reader.vid_name - pbar.set_description(vid_name) - vid_length = len(loader) - next_voting_frame = args.num_voting_frames - 1 - # no need to count usage for LT if the video is not that long anyway - config['enable_long_term_count_usage'] = ( - config['enable_long_term'] - and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * - config['num_prototypes']) >= config['max_long_term_elements']) - - try: - processor = DEVAInferenceCore(network, config=config) - result_saver = ResultSaver(out_path, - vid_name, - dataset=dataset_name, - palette=vid_reader.palette, - object_manager=processor.object_manager) - - for ti, data in enumerate(loader): - with torch.cuda.amp.autocast(enabled=args.amp): - image = data['rgb'].cuda()[0] - mask = data.get('mask') - if mask is not None: - mask = mask.cuda()[0] - info = data['info'] - frame = info['frame'][0] - shape = info['shape'] - need_resize = info['need_resize'][0] - is_rgb = info['is_rgb'][0] - path_to_image = info['path_to_image'][0] - if args.save_all: - info['save'][0] = True - if is_rgb: - # if the mask format is RGB (instead of grayscale/palette), we need - # more usable IDs (>255) - processor.enabled_long_id() - - segments_info = None - if not global_json_enabled: - # safety check - json_path = info.get('json') - if per_vid_json_enabled is None: - if json_path is None: - print('Neither global nor per-video json exist.') - per_vid_json_enabled = False - else: - print('Using per-video json.') - per_vid_json_enabled = True - elif json_path is None and per_vid_json_enabled: - raise RuntimeError( - f'Per-video json is enabled but not found for {vid_name}.') - - # read the per-video pred.json - if per_vid_json_enabled: - with open(json_path[0], 'r') as f: - segments_info = json.load(f) + +def main(): + # for id2rgb + np.random.seed(42) + """ + Arguments loading + """ + parser = ArgumentParser() + parser.add_argument('--img_path', default='./example/vipseg') + parser.add_argument('--mask_path') + parser.add_argument('--json_path', default=None) + parser.add_argument('--detection_every', type=int, default=5) + parser.add_argument('--num_voting_frames', + default=3, + type=int, + help='Number of frames selected for voting. only valid in semionline') + parser.add_argument('--dataset', default='vipseg', help='vipseg/burst/unsup_davis17/demo') + parser.add_argument('--max_missed_detection_count', type=int, default=5) + # skip VPQ/STQ computation + parser.add_argument('--no_metrics', action='store_true') + + parser.add_argument('--temporal_setting', default='semionline', help='semionline/online') + parser.add_argument('--max_num_objects', + default=-1, + type=int, + help='Max. num of objects to keep in memory. 
-1 for no limit') + + # the options below are only valid for burst + parser.add_argument('--start', type=int, default=None, help='for distributed testing') + parser.add_argument('--count', type=int, default=None, help='for distributed testing') + parser.add_argument('--burst_gt_json', default='../BURST/val/all_classes.json') + + # only valid for VIPSeg + parser.add_argument('--vipseg_root', default='../VIPSeg/VIPSeg_720P') + + # this option is only valid for unsup_davis17; limit the maximum number of predicted objects + parser.add_argument('--postprocess_limit_max_id', type=int, default=20) + + add_common_eval_args(parser) + network, config, args = get_model_and_config(parser) + """ + Temporal setting + """ + temporal_setting = args.temporal_setting.lower() + assert temporal_setting in ['semionline', 'online'] + """ + Data preparation + """ + dataset_name = args.dataset.lower() + assert dataset_name in ['vipseg', 'burst', 'unsup_davis17', + 'demo'], f'Unknown dataset {dataset_name}' + print(f'Dataset: {dataset_name}') + is_vipseg = (dataset_name == 'vipseg') + is_burst = (dataset_name == 'burst') + is_davis = (dataset_name == 'unsup_davis17') + is_demo = (dataset_name == 'demo') + + # try to find json path is not given + if args.json_path is None: + if path.exists(path.join(args.mask_path, 'pred.json')): + args.json_path = path.join(args.mask_path, 'pred.json') + out_path = args.output + + # try to find the real mask path if it is hidden behind pan_pred + if path.exists(path.join(args.mask_path, 'pan_pred')): + args.mask_path = path.join(args.mask_path, 'pan_pred') + if is_vipseg or is_davis or is_demo: + meta_dataset = VIPSegDetectionTestDataset(args.img_path, args.mask_path, args.size) + elif is_burst: + meta_dataset = BURSTDetectionTestDataset(args.img_path, + args.mask_path, + args.burst_gt_json, + args.size, + start=args.start, + count=args.count) + else: + raise NotImplementedError + + torch.autograd.set_grad_enabled(False) + + # Set up loader + meta_loader = meta_dataset.get_datasets() + """ + Read the global pred.json if any + """ + global_json_enabled = args.json_path is not None + per_vid_json_enabled = None + if global_json_enabled: + print(f'Using a global json file {args.json_path}') + with open(args.json_path, 'r') as f: + all_json_info = json.load(f) + all_json_info = all_json_info['annotations'] + + video_id_to_annotation = {} + for ann in all_json_info: + video_id_to_annotation[ann['video_id']] = ann['annotations'] + + if is_vipseg: + # we will export this as a single json for VPQ/STQ evaluation + output_json_annotations = [] + + total_process_time = 0 + total_frames = 0 + + # Start eval + pbar = tqdm(meta_loader, total=len(meta_dataset)) + for vid_reader in pbar: + loader = DataLoader(vid_reader, batch_size=1, shuffle=False, num_workers=2) + vid_name = vid_reader.vid_name + pbar.set_description(vid_name) + vid_length = len(loader) + next_voting_frame = args.num_voting_frames - 1 + # no need to count usage for LT if the video is not that long anyway + config['enable_long_term_count_usage'] = ( + config['enable_long_term'] + and (vid_length / (config['max_mid_term_frames'] - config['min_mid_term_frames']) * + config['num_prototypes']) >= config['max_long_term_elements']) + + try: + processor = DEVAInferenceCore(network, config=config) + result_saver = ResultSaver(out_path, + vid_name, + dataset=dataset_name, + palette=vid_reader.palette, + object_manager=processor.object_manager) + + for ti, data in enumerate(loader): + with torch.cuda.amp.autocast(enabled=args.amp): + 
image = data['rgb'].cuda()[0] + mask = data.get('mask') + if mask is not None: + mask = mask.cuda()[0] + info = data['info'] + frame = info['frame'][0] + shape = info['shape'] + need_resize = info['need_resize'][0] + is_rgb = info['is_rgb'][0] + path_to_image = info['path_to_image'][0] + if args.save_all: + info['save'][0] = True + if is_rgb: + # if the mask format is RGB (instead of grayscale/palette), we need + # more usable IDs (>255) processor.enabled_long_id() - else: - # read from the global json - segments_info = video_id_to_annotation[vid_name][ti]['segments_info'] - processor.enabled_long_id() - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - segments_info = convert_json_dict_to_objects_info(mask, - segments_info, - dataset=dataset_name) - frame_info = FrameInfo(image, mask, segments_info, ti, info) - - if temporal_setting == 'semionline': - if ti + args.num_voting_frames > next_voting_frame: - # wait for more frames before proceeding - processor.add_to_temporary_buffer(frame_info) - - if ti == next_voting_frame: - # process this clip - this_image = processor.frame_buffer[0].image - this_ti = processor.frame_buffer[0].ti - this_frame_name = processor.frame_buffer[0].name - save_this_frame = processor.frame_buffer[0].save_needed - path_to_image = processor.frame_buffer[0].path_to_image - - _, mask, new_segments_info = processor.vote_in_temporary_buffer( - keyframe_selection='first') - prob = processor.incorporate_detection(this_image, mask, - new_segments_info) - next_voting_frame += args.detection_every - if next_voting_frame >= vid_length: - next_voting_frame = vid_length + args.num_voting_frames - end.record() - torch.cuda.synchronize() - total_process_time += (start.elapsed_time(end) / 1000) - total_frames += 1 + segments_info = None + if not global_json_enabled: + # safety check + json_path = info.get('json') + if per_vid_json_enabled is None: + if json_path is None: + print('Neither global nor per-video json exist.') + per_vid_json_enabled = False + else: + print('Using per-video json.') + per_vid_json_enabled = True + elif json_path is None and per_vid_json_enabled: + raise RuntimeError( + f'Per-video json is enabled but not found for {vid_name}.') + + # read the per-video pred.json + if per_vid_json_enabled: + with open(json_path[0], 'r') as f: + segments_info = json.load(f) + processor.enabled_long_id() + else: + # read from the global json + segments_info = video_id_to_annotation[vid_name][ti]['segments_info'] + processor.enabled_long_id() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + segments_info = convert_json_dict_to_objects_info(mask, + segments_info, + dataset=dataset_name) + frame_info = FrameInfo(image, mask, segments_info, ti, info) + + if temporal_setting == 'semionline': + if ti + args.num_voting_frames > next_voting_frame: + # wait for more frames before proceeding + processor.add_to_temporary_buffer(frame_info) + + if ti == next_voting_frame: + # process this clip + this_image = processor.frame_buffer[0].image + this_ti = processor.frame_buffer[0].ti + this_frame_name = processor.frame_buffer[0].name + save_this_frame = processor.frame_buffer[0].save_needed + path_to_image = processor.frame_buffer[0].path_to_image + + _, mask, new_segments_info = processor.vote_in_temporary_buffer( + keyframe_selection='first') + prob = processor.incorporate_detection( + this_image, mask, new_segments_info) + next_voting_frame += 
+                            if next_voting_frame >= vid_length:
+                                next_voting_frame = vid_length + args.num_voting_frames
-                        if save_this_frame:
-                            result_saver.save_mask(
-                                prob,
-                                this_frame_name,
-                                need_resize=need_resize,
-                                shape=shape,
-                                path_to_image=path_to_image,
-                            )
-
-                        for frame_info in processor.frame_buffer[1:]:
-                            this_image = frame_info.image
-                            this_ti = frame_info.ti
-                            this_frame_name = frame_info.name
-                            save_this_frame = frame_info.save_needed
-                            path_to_image = frame_info.path_to_image
-                            start = torch.cuda.Event(enable_timing=True)
-                            end = torch.cuda.Event(enable_timing=True)
-                            start.record()
-                            prob = processor.step(this_image,
-                                                  None,
-                                                  None,
-                                                  end=(this_ti == vid_length - 1))
-                            end.record()
-                            torch.cuda.synchronize()
-                            total_process_time += (start.elapsed_time(end) / 1000)
-                            total_frames += 1
-
-                            if save_this_frame:
-                                result_saver.save_mask(prob,
-                                                       this_frame_name,
-                                                       need_resize=need_resize,
-                                                       shape=shape,
-                                                       path_to_image=path_to_image)
-
-                        processor.clear_buffer()
-                    else:
-                        # standard propagation
-                        prob = processor.step(image, None, None, end=(ti == vid_length - 1))
+                            end.record()
+                            torch.cuda.synchronize()
+                            total_process_time += (start.elapsed_time(end) / 1000)
+                            total_frames += 1
+
+                            if save_this_frame:
+                                result_saver.save_mask(
+                                    prob,
+                                    this_frame_name,
+                                    need_resize=need_resize,
+                                    shape=shape,
+                                    path_to_image=path_to_image,
+                                )
+
+                            for frame_info in processor.frame_buffer[1:]:
+                                this_image = frame_info.image
+                                this_ti = frame_info.ti
+                                this_frame_name = frame_info.name
+                                save_this_frame = frame_info.save_needed
+                                path_to_image = frame_info.path_to_image
+                                start = torch.cuda.Event(enable_timing=True)
+                                end = torch.cuda.Event(enable_timing=True)
+                                start.record()
+                                prob = processor.step(this_image,
+                                                      None,
+                                                      None,
+                                                      end=(this_ti == vid_length - 1))
+                                end.record()
+                                torch.cuda.synchronize()
+                                total_process_time += (start.elapsed_time(end) / 1000)
+                                total_frames += 1
+
+                                if save_this_frame:
+                                    result_saver.save_mask(prob,
+                                                           this_frame_name,
+                                                           need_resize=need_resize,
+                                                           shape=shape,
+                                                           path_to_image=path_to_image)
+
+                            processor.clear_buffer()
+                        else:
+                            # standard propagation
+                            prob = processor.step(image, None, None, end=(ti == vid_length - 1))
+                            end.record()
+                            torch.cuda.synchronize()
+                            total_process_time += (start.elapsed_time(end) / 1000)
+                            total_frames += 1
+                            if info['save'][0]:
+                                result_saver.save_mask(prob,
+                                                       frame,
+                                                       need_resize=need_resize,
+                                                       shape=shape,
+                                                       path_to_image=path_to_image)
+
+                    elif temporal_setting == 'online':
+                        if ti % args.detection_every == 0:
+                            # incorporate new detections
+                            assert mask is not None
+                            prob = processor.incorporate_detection(image, mask, segments_info)
+                        else:
+                            # Run the model on this frame
+                            prob = processor.step(image, None, None, end=(ti == vid_length - 1))
                        end.record()
                        torch.cuda.synchronize()
                        total_process_time += (start.elapsed_time(end) / 1000)
@@ -275,72 +296,57 @@
                                           shape=shape,
                                           path_to_image=path_to_image)
-                elif temporal_setting == 'online':
-                    if ti % args.detection_every == 0:
-                        # incorporate new detections
-                        assert mask is not None
-                        prob = processor.incorporate_detection(image, mask, segments_info)
-                    else:
-                        # Run the model on this frame
-                        prob = processor.step(image, None, None, end=(ti == vid_length - 1))
-                    end.record()
-                    torch.cuda.synchronize()
-                    total_process_time += (start.elapsed_time(end) / 1000)
-                    total_frames += 1
-                    if info['save'][0]:
-                        result_saver.save_mask(prob,
-                                               frame,
-                                               need_resize=need_resize,
-                                               shape=shape,
-                                               path_to_image=path_to_image)
-
-                else:
-                    raise NotImplementedError
-
-        result_saver.end()
-        if is_vipseg:
-            # save this for a dataset-level json
-            output_json_annotations.append(result_saver.video_json)
-        elif is_burst:
-            # save this as a video-level json, which we merge later
-            with open(path.join(out_path, vid_name, 'pred.json'), 'w') as f:
-                json.dump(result_saver.video_json, f)
-        elif is_demo:
-            # save this as a video-level json in a separate folder
-            os.makedirs(path.join(out_path, 'JSONFiles'), exist_ok=True)
-            with open(path.join(out_path, 'JSONFiles', f'{vid_name}.json'), 'w') as f:
-                json.dump(result_saver.video_json, f, indent=4)
-
-    except Exception as e:
-        print(f'Runtime error at {vid_name}')
-        print(e)
-        raise e  # comment this out if you want
-
-if is_vipseg:
-    output_json = {'annotations': output_json_annotations}
-    with open(path.join(out_path, 'pred.json'), 'w') as f:
-        json.dump(output_json, f)
-
-print(f'Total processing time: {total_process_time}')
-print(f'Total processed frames: {total_frames}')
-print(f'FPS: {total_frames / total_process_time}')
-print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
-
-if is_vipseg:
-    vipseg_root = args.vipseg_root
-    print('Starting evaluation...')
-    merge_stuff(out_path, out_path)
-
-    if not args.no_metrics:
-        p1 = Process(target=partial(eval_stq, out_path, f'{vipseg_root}/panomasksRGB',
-                                    f'{vipseg_root}/panoptic_gt_VIPSeg_val.json'))
-        p1.start()
-        eval_vpq(out_path,
-                 f'{vipseg_root}/panomasksRGB',
-                 f'{vipseg_root}/panoptic_gt_VIPSeg_val.json',
-                 num_processes=16)
-        p1.join()
-elif is_davis:
-    if args.postprocess_limit_max_id > 0:
-        print('Post-processing DAVIS 2017...')
-        limit_max_id(out_path, out_path, max_num_objects=args.postprocess_limit_max_id)
+                    else:
+                        raise NotImplementedError
+
+            result_saver.end()
+            if is_vipseg:
+                # save this for a dataset-level json
+                output_json_annotations.append(result_saver.video_json)
+            elif is_burst:
+                # save this as a video-level json, which we merge later
+                with open(path.join(out_path, vid_name, 'pred.json'), 'w') as f:
+                    json.dump(result_saver.video_json, f)
+            elif is_demo:
+                # save this as a video-level json in a separate folder
+                os.makedirs(path.join(out_path, 'JSONFiles'), exist_ok=True)
+                with open(path.join(out_path, 'JSONFiles', f'{vid_name}.json'), 'w') as f:
+                    json.dump(result_saver.video_json, f, indent=4)
+
+        except Exception as e:
+            print(f'Runtime error at {vid_name}')
+            print(e)
+            raise e  # comment this out if you want
+
+    if is_vipseg:
+        output_json = {'annotations': output_json_annotations}
+        with open(path.join(out_path, 'pred.json'), 'w') as f:
+            json.dump(output_json, f)
+
+    print(f'Total processing time: {total_process_time}')
+    print(f'Total processed frames: {total_frames}')
+    print(f'FPS: {total_frames / total_process_time}')
+    print(f'Max allocated memory (MB): {torch.cuda.max_memory_allocated() / (2**20)}')
+
+    if is_vipseg:
+        vipseg_root = args.vipseg_root
+        print('Starting evaluation...')
+        merge_stuff(out_path, out_path)
+
+        if not args.no_metrics:
+            p1 = Process(target=partial(eval_stq, out_path, f'{vipseg_root}/panomasksRGB',
+                                        f'{vipseg_root}/panoptic_gt_VIPSeg_val.json'))
+            p1.start()
+            eval_vpq(out_path,
+                     f'{vipseg_root}/panomasksRGB',
+                     f'{vipseg_root}/panoptic_gt_VIPSeg_val.json',
+                     num_processes=16)
+            p1.join()
+    elif is_davis:
+        if args.postprocess_limit_max_id > 0:
+            print('Post-processing DAVIS 2017...')
+            limit_max_id(out_path, out_path, max_num_objects=args.postprocess_limit_max_id)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/merge_multi_scale.py b/scripts/merge_multi_scale.py
index de5def7..a3fc9c0 100644
--- a/scripts/merge_multi_scale.py
+++ b/scripts/merge_multi_scale.py
@@ -8,7 +8,7 @@ import hickle as hkl
 
 from PIL import Image, ImagePalette
 
-from progressbar import progressbar
+from tqdm import tqdm
 from multiprocessing import Pool
 
 from deva.utils import palette
@@ -123,7 +123,7 @@ def process_vid(vid):
     print('Total number of videos: ', len(all_vid))
 
     pool = Pool(processes=args.num_proc)
-    for _ in progressbar(pool.imap_unordered(process_vid, all_vid), max_value=len(all_vid)):
+    for _ in tqdm(pool.imap_unordered(process_vid, all_vid), total=len(all_vid)):
         pass
 
     pool.close()
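Note on the progressbar-to-tqdm migration in the hunks above: tqdm reports the expected iteration count through the `total=` keyword rather than progressbar's `max_value=`, which is why the new loops pass `total=len(...)`. The snippet below is a minimal, self-contained sketch of the same Pool-plus-progress-bar pattern; `process_item` and the item list are placeholders for illustration, not code from this repository.

# Minimal sketch of the pool + tqdm progress pattern (stand-in worker, not the
# repository's process_vid). tqdm takes `total=`, not progressbar's `max_value=`.
from multiprocessing import Pool

from tqdm import tqdm


def process_item(x):
    # stand-in worker; returns its input unchanged
    return x


if __name__ == '__main__':
    items = list(range(100))
    with Pool(processes=4) as pool:
        # imap_unordered yields results lazily, so the bar length must be given explicitly
        for _ in tqdm(pool.imap_unordered(process_item, items), total=len(items)):
            pass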