diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py
index 1709d9dbcbb3..dcd6df29767a 100644
--- a/nemo/collections/multimodal/data/neva/neva_dataset.py
+++ b/nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -291,6 +291,7 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in
     return sources
 
+
 def process_image(processor, image, image_aspect_ratio="square"):
     if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor):
         # image processor from HF
@@ -322,12 +323,11 @@ def expand2square(pil_img, background_color):
         else:
             image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
     else:
-        assert (
-            image_aspect_ratio == 'square'
-        ), 'NeMo image transform with setting `image_aspect_ratio` to `square`.'
+        assert image_aspect_ratio == 'square', 'NeMo image transform with setting `image_aspect_ratio` to `square`.'
         image = processor(image)
     return image
 
+
 def preprocess_llama_3(
     sources: dict,
     tokenizer,
@@ -803,9 +803,11 @@ def preprocess_nv_dpo(
             if len(parts) != 2:
                 break
 
-            #handle label if exists
+            # handle label if exists
             labels_match = re.search(rf"{re.escape(DEFAULT_LABELS_TOKEN)}.*?\n", parts[1])
-            instruction_len = len(tokenizer.text_to_ids(parts[0] + sep + (parts[1][:labels_match.end()] if labels_match else "")))
+            instruction_len = len(
+                tokenizer.text_to_ids(parts[0] + sep + (parts[1][: labels_match.end()] if labels_match else ""))
+            )
             round_len = len(tokenizer.text_to_ids(rou + conv.sep))
 
             target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index 8792c80280ca..f8d2365b6dc7 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -300,7 +300,8 @@ def create_vision_encoder_and_processor(self, mm_cfg):
         if mm_cfg.vision_encoder.get("from_hf", False):
             if "clip" in mm_cfg.vision_encoder.from_pretrained:
                 vision_encoder = CLIPVisionModel.from_pretrained(
-                    mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16,
+                    mm_cfg.vision_encoder.from_pretrained,
+                    torch_dtype=torch.bfloat16,
                 ).cuda()
                 vision_encoder = vision_encoder.to(torch.bfloat16)
                 if mm_cfg.vision_encoder.freeze:
@@ -312,7 +313,8 @@ def create_vision_encoder_and_processor(self, mm_cfg):
                 )
             elif "siglip" in mm_cfg.vision_encoder.from_pretrained:
                 vision_encoder = SiglipVisionModel.from_pretrained(
-                    mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16,
+                    mm_cfg.vision_encoder.from_pretrained,
+                    torch_dtype=torch.bfloat16,
                 ).cuda()
                 vision_encoder = vision_encoder.to(torch.bfloat16)
                 if mm_cfg.vision_encoder.freeze:
@@ -323,7 +325,7 @@ def create_vision_encoder_and_processor(self, mm_cfg):
                     mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16
                 )
             else:
-                raise(ValueError("Currently only support CLIPVisionModel and SigLipVisionModel from Huggingface"))
+                raise (ValueError("Currently only support CLIPVisionModel and SigLipVisionModel from Huggingface"))
         else:
             vision_cfg = MegatronCLIPModel.restore_from(
                 mm_cfg.vision_encoder.from_pretrained, return_config=True
@@ -333,7 +335,12 @@ def create_vision_encoder_and_processor(self, mm_cfg):
             if mm_cfg.vision_encoder.freeze:
                 vision_encoder.freeze()
         crop_size = mm_cfg.get("crop_size", (224, 224))
-        image_processor = image_transform(crop_size, is_train=False, mean=None, std=None, )
+        image_processor = image_transform(
+            crop_size,
+            is_train=False,
+            mean=None,
+            std=None,
+        )
 
         return vision_encoder, image_processor
@@ -1058,8 +1065,9 @@ def build_train_valid_test_datasets(self):
         else:
             ds_dict = make_supervised_data_module(
                 tokenizer=self.tokenizer,
-                image_processor=self.model.module.image_processor if hasattr(self.model,
-                                                                             "module") else self.model.image_processor,
+                image_processor=(
+                    self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor
+                ),
                 model_cfg=self.cfg,
             )
         self._train_ds = ds_dict["train_dataset"]
diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index 3d65fd987b83..8bde092da947 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -430,8 +430,9 @@ def image_processor(maybe_image_path):
         else:
             image = maybe_image_path
 
-        processor = model.model.module.image_processor \
-            if hasattr(model.model, "module") else model.model.image_processor
+        processor = (
+            model.model.module.image_processor if hasattr(model.model, "module") else model.model.image_processor
+        )
         image = process_image(processor, image, neva_cfg.data.image_aspect_ratio)
         if neva_cfg.precision in [16, '16', '16-mixed']:
             media = image.type(torch.float16)