diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py
index 1709d9dbcbb3..dcd6df29767a 100644
--- a/nemo/collections/multimodal/data/neva/neva_dataset.py
+++ b/nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -291,6 +291,7 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in
     return sources
 
+
 def process_image(processor, image, image_aspect_ratio="square"):
     if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor):
         # image processor from HF
@@ -322,12 +323,11 @@ def expand2square(pil_img, background_color):
         else:
             image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
     else:
-        assert (
-            image_aspect_ratio == 'square'
-        ), 'NeMo image transform with setting `image_aspect_ratio` to `square`.'
+        assert image_aspect_ratio == 'square', 'NeMo image transform with setting `image_aspect_ratio` to `square`.'
         image = processor(image)
     return image
 
+
 def preprocess_llama_3(
     sources: dict,
     tokenizer,
@@ -803,9 +803,11 @@ def preprocess_nv_dpo(
             if len(parts) != 2:
                 break
 
-            #handle label if exists
+            # handle label if exists
             labels_match = re.search(rf"{re.escape(DEFAULT_LABELS_TOKEN)}.*?\n", parts[1])
-            instruction_len = len(tokenizer.text_to_ids(parts[0] + sep + (parts[1][:labels_match.end()] if labels_match else "")))
+            instruction_len = len(
+                tokenizer.text_to_ids(parts[0] + sep + (parts[1][: labels_match.end()] if labels_match else ""))
+            )
             round_len = len(tokenizer.text_to_ids(rou + conv.sep))
 
             target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index 8792c80280ca..f8d2365b6dc7 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -300,7 +300,8 @@ def create_vision_encoder_and_processor(self, mm_cfg):
         if mm_cfg.vision_encoder.get("from_hf", False):
             if "clip" in mm_cfg.vision_encoder.from_pretrained:
                 vision_encoder = CLIPVisionModel.from_pretrained(
-                    mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16,
+                    mm_cfg.vision_encoder.from_pretrained,
+                    torch_dtype=torch.bfloat16,
                 ).cuda()
                 vision_encoder = vision_encoder.to(torch.bfloat16)
                 if mm_cfg.vision_encoder.freeze:
@@ -312,7 +313,8 @@ def create_vision_encoder_and_processor(self, mm_cfg):
                 )
             elif "siglip" in mm_cfg.vision_encoder.from_pretrained:
                 vision_encoder = SiglipVisionModel.from_pretrained(
-                    mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16,
+                    mm_cfg.vision_encoder.from_pretrained,
+                    torch_dtype=torch.bfloat16,
                 ).cuda()
                 vision_encoder = vision_encoder.to(torch.bfloat16)
                 if mm_cfg.vision_encoder.freeze:
@@ -323,7 +325,7 @@ def create_vision_encoder_and_processor(self, mm_cfg):
                     mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16
                 )
             else:
-                raise(ValueError("Currently only support CLIPVisionModel and SigLipVisionModel from Huggingface"))
+                raise (ValueError("Currently only support CLIPVisionModel and SigLipVisionModel from Huggingface"))
         else:
             vision_cfg = MegatronCLIPModel.restore_from(
                 mm_cfg.vision_encoder.from_pretrained, return_config=True
@@ -333,7 +335,12 @@ def create_vision_encoder_and_processor(self, mm_cfg):
             if mm_cfg.vision_encoder.freeze:
                 vision_encoder.freeze()
         crop_size = mm_cfg.get("crop_size", (224, 224))
-        image_processor = image_transform(crop_size, is_train=False, mean=None, std=None, )
+        image_processor = image_transform(
+            crop_size,
+            is_train=False,
+            mean=None,
+            std=None,
+        )
 
         return vision_encoder, image_processor
@@ -1058,8 +1065,9 @@ def build_train_valid_test_datasets(self):
         else:
             ds_dict = make_supervised_data_module(
                 tokenizer=self.tokenizer,
-                image_processor=self.model.module.image_processor if hasattr(self.model,
-                                                                             "module") else self.model.image_processor,
+                image_processor=(
+                    self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor
+                ),
                 model_cfg=self.cfg,
             )
         self._train_ds = ds_dict["train_dataset"]
diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index 3d65fd987b83..8bde092da947 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -430,8 +430,9 @@ def image_processor(maybe_image_path):
         else:
             image = maybe_image_path
 
-        processor = model.model.module.image_processor \
-            if hasattr(model.model, "module") else model.model.image_processor
+        processor = (
+            model.model.module.image_processor if hasattr(model.model, "module") else model.model.image_processor
+        )
         image = process_image(processor, image, neva_cfg.data.image_aspect_ratio)
         if neva_cfg.precision in [16, '16', '16-mixed']:
             media = image.type(torch.float16)