From 90222e0032ebe3e207d3c4dca4a15bcc785c50c5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 29 Mar 2024 15:04:51 +0800 Subject: [PATCH 001/126] refactor llava --- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 242 ++++++++++++++++ xtuner/dataset/evaluation/__init__.py | 4 + .../dataset/evaluation/base_eval_dataset.py | 59 ++++ .../dataset/evaluation/mme_llava_dataset.py | 164 +++++++++++ .../multiple_choice_llava_dataset.py | 273 ++++++++++++++++++ xtuner/dataset/evaluation/utils.py | 75 +++++ xtuner/engine/__init__.py | 5 +- xtuner/engine/hooks/evaluate_chat_hook.py | 67 +---- xtuner/engine/runner/__init__.py | 4 +- xtuner/engine/runner/loops.py | 189 ++++++++++++ xtuner/model/llava.py | 148 ++++++++-- xtuner/model/utils.py | 3 +- 12 files changed, 1146 insertions(+), 87 deletions(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py create mode 100644 xtuner/dataset/evaluation/__init__.py create mode 100644 xtuner/dataset/evaluation/base_eval_dataset.py create mode 100644 xtuner/dataset/evaluation/mme_llava_dataset.py create mode 100644 xtuner/dataset/evaluation/multiple_choice_llava_dataset.py create mode 100644 xtuner/dataset/evaluation/utils.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py new file mode 100644 index 000000000..66d67bdda --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -0,0 +1,242 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' 
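+# Note: `EvaluateChatHook` (configured in PART 5 below) feeds each entry of
+# `evaluation_images` / `evaluation_inputs` through `LLaVAModel.chat()` every
+# `evaluation_freq` training iterations and logs the decoded prediction.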
+evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. 
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) + +test_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_dataloader and val_evaluator, +# only utilizing val_dataset. +val_dataloader = dict(dataset=val_dataset) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_dataloader and test_evaluator, +# only utilizing test_dataset. 
+test_dataloader = dict(dataset=test_dataset) +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py new file mode 100644 index 000000000..1a09f5c64 --- /dev/null +++ b/xtuner/dataset/evaluation/__init__.py @@ -0,0 +1,4 @@ +from .mme_llava_dataset import MMELLaVADataset +from multiple_choice_llava_dataset import MultipleChoiceLLaVADataset + +__all__ = ['MMELLaVADataset', 'MultipleChoiceLLaVADataset'] diff --git a/xtuner/dataset/evaluation/base_eval_dataset.py b/xtuner/dataset/evaluation/base_eval_dataset.py new file mode 100644 index 000000000..1fba323b2 --- /dev/null +++ b/xtuner/dataset/evaluation/base_eval_dataset.py @@ -0,0 +1,59 @@ +from torch.utils.data import Dataset +import copy +from collections.abc import Mapping +from typing import Union +from mmengine.config import Config +import logging +from mmengine.fileio import list_from_file +from mmengine.logging import print_log +from abc import abstractmethod + + +class BaseEvalDataset(Dataset): + + METAINFO: dict = dict() + + def __init__(self, metainfo: Union[Mapping, Config, None] = None): + self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) + + @classmethod + def _load_metainfo(cls, + metainfo: Union[Mapping, Config, None] = None) -> dict: + """Collect meta information from the dictionary of meta. + + Args: + metainfo (Mapping or Config, optional): Meta information dict. + If ``metainfo`` contains existed filename, it will be + parsed by ``list_from_file``. + + Returns: + dict: Parsed meta information. + """ + # avoid `cls.METAINFO` being overwritten by `metainfo` + cls_metainfo = copy.deepcopy(cls.METAINFO) + if metainfo is None: + return cls_metainfo + if not isinstance(metainfo, (Mapping, Config)): + raise TypeError('metainfo should be a Mapping or Config, ' + f'but got {type(metainfo)}') + + for k, v in metainfo.items(): + if isinstance(v, str): + # If type of value is string, and can be loaded from + # corresponding backend. it means the file name of meta file. 
+ try: + cls_metainfo[k] = list_from_file(v) + except (TypeError, FileNotFoundError): + print_log( + f'{v} is not a meta file, simply parsed as meta ' + 'information', + logger='current', + level=logging.WARNING) + cls_metainfo[k] = v + else: + cls_metainfo[k] = v + return cls_metainfo + + @abstractmethod + def evaluate(self, results, work_dir): + pass diff --git a/xtuner/dataset/evaluation/mme_llava_dataset.py b/xtuner/dataset/evaluation/mme_llava_dataset.py new file mode 100644 index 000000000..4e9631548 --- /dev/null +++ b/xtuner/dataset/evaluation/mme_llava_dataset.py @@ -0,0 +1,164 @@ +import os +import os.path as osp + +import pandas as pd +import torch +from mmengine.dist import (master_only) +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image, expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from PIL import Image +from .utils import YOrN_Extraction, MME_rating + + +class MMELLaVADataset(BaseEvalDataset): + + def __init__(self, data_file, image_folder, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, for_llava_prompt=False, metainfo=None): + super().__init__(metainfo) + self.image_folder = image_folder + self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + image_path = self.df.iloc[idx]['image_path'] + + question = self.df.iloc[idx]['question'] + if self.for_llava_prompt: + question = question.replace(' Please answer yes or no.', + '\nAnswer the question using a single word or phrase.') + + category = self.df.iloc[idx]['category'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'image_path': image_path, + 'question': question, + 'answer': answer, + 'category': category, + 'index': index, + 'img_id': idx + } + data_list.append(data) + return data_list + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = {'img_id': data['img_id']} + + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.use_system: + inputs = self.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.template['INSTRUCTION'].format(input=text, round=1) + + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) + 
chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 发现重新生成数据集后,感知部分还是对不上,推理部分对的上,暂时不清楚原因 + # image = self.get_image(data['img']).convert('RGB') + image = Image.open(os.path.join(self.image_folder, + data['image_path'])).convert('RGB') + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + + @master_only + def evaluate(self, results, work_dir): + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + cur_result['image_path'] = filtered_rows.get('image_path') + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + data = results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + # 不使用 gpt + data['extracted'] = [ans_map[x] for x in data['index']] + data['score'] = (data['answer'] == data['extracted']) + + results_df = pd.DataFrame(data) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = MME_rating(data) + print_log('============================================', 'current') + print_log(score, 'current') + print_log('============================================', 'current') + print_log(f'MME YOrN_eval successfully finished evaluating', 'current') + return score diff --git a/xtuner/dataset/evaluation/multiple_choice_llava_dataset.py b/xtuner/dataset/evaluation/multiple_choice_llava_dataset.py new file mode 100644 index 000000000..54cf0609f --- /dev/null +++ b/xtuner/dataset/evaluation/multiple_choice_llava_dataset.py @@ -0,0 +1,273 @@ +import os +import os.path as osp +import re +import string + +import numpy as np +import pandas as pd +import torch +from mmengine.dist import (master_only) +from rich.console import Console +from rich.table import Table +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image, expand2square +from xtuner.tools.utils import is_cn_string +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.registry import BUILDER +from mmengine.logging import print_log + + +class MultipleChoiceLLaVADataset(BaseEvalDataset): + # 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d' + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.df = 
pd.read_csv(data_file, sep='\t') + self.split = 'dev' if 'answer' in self.df.iloc[0].keys() else 'test' + self.has_l2_category = 'l2-category' in self.df.columns.to_list() + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + question = self.df.iloc[idx]['question'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + category = self.df.iloc[idx]['category'] + + options = { + cand: self.load_from_df(idx, cand) + for cand in string.ascii_uppercase + if self.load_from_df(idx, cand) is not None + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + hint = self.load_from_df(idx, 'hint') + data = { + 'img': image, + 'question': question, + 'answer': answer, + 'options': options_prompt, + 'category': category, + 'options_dict': options, + 'index': index, + 'context': hint, + 'img_id': idx + } + if self.has_l2_category: + data.update({'l2-category': self.df.iloc[idx]['l2-category']}) + data_list.append(data) + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = {'img_id': data['img_id']} + + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + + if self.use_system: + inputs = self.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.template['INSTRUCTION'].format(input=text, round=1) + + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + image = self.get_image(data['img']).convert('RGB') + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + + def load_from_df(self, idx, key): + if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]): + return self.df.iloc[idx][key] + else: + return None + + @master_only + def evaluate(self, results, work_dir): + + def calc_acc(df, 
group='category'): + assert group in ['overall', 'category', 'l2-category'] + if group == 'overall': + res = {'Average': np.mean(df['hit'])} + else: + res = {} + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_df = df[df[group] == ab] + res[ab] = np.mean(sub_df['hit']) + return res + + def eval_sub_data(sub_data, answer_map): + lt = len(sub_data) + for i in range(lt): + item = sub_data.iloc[i] + match = re.search(r'([A-D]+)', item['prediction']) + pred = match.group(1) if match else '' + gt = answer_map[item['index']] + if gt != pred: + return 0 + return 1 + + def show_result(ret_json): + show_dict = ret_json.copy() + table = Table(title=f' Multiple Choice ({self.data_file}) ') + console = Console() + table.add_column('Category', justify='left') + table.add_column('Accuracy (%)', justify='right') + average = show_dict.pop('Average') * 100 + table.add_row('Average', f'{average:.1f}') + table.add_section() + for cat_name, cat_acc in show_dict.items(): + table.add_row(cat_name, f'{cat_acc * 100:.1f}') + with console.capture() as capture: + console.print(table, end='') + print_log('\n' + capture.get(), 'current') + print_log('Note: Please be cautious if you use the results in papers, ' + "since we don't use ChatGPT as a helper for choice " + 'extraction', 'current') + + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result.update(filtered_rows.get('options_dict')) + cur_result['prediction'] = pred_dict['prediction'] + if filtered_rows.get('category') is not None: + cur_result['category'] = filtered_rows.get('category') + if filtered_rows.get('l2-category') is not None: + cur_result['l2-category'] = filtered_rows.get('l2-category') + cur_result['index'] = filtered_rows.get('index') + cur_result['split'] = filtered_rows.get('split') + cur_result['answer'] = filtered_rows.get('answer') + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + if self.split != 'dev': + print_log('Test set does not have answers, skip evaluation', 'current') + return {'Average': 0} + + data = results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in 'ABCD' else k] = data.pop(k) + + data_main = data[data['index'] < int(1e6)] + cate_map = { + i: c + for i, c in zip(self.df['index'], self.df['category']) + } + if self.has_l2_category: + l2_cate_map = { + i: c + for i, c in zip(self.df['index'], self.df['l2-category']) + } + answer_map = { + i: c + for i, c in zip(self.df['index'], self.df['answer']) + } + + lt = len(data_main) + hit, tot = 0, 0 + result = {} + for i in range(lt): + item_main = data_main.iloc[i] + idx = item_main['index'] + assert idx not in result + sub_data = data[data['index'] % int(1e6) == idx] + ret = eval_sub_data(sub_data, answer_map) + result[idx] = ret + hit += ret + tot += 1 + + indices = data_main['index'] + data_main = data_main.copy() + data_main['hit'] = [result[i] for i in indices] + main_idx = data_main['index'] + data_main['category'] = [cate_map[i] for i in main_idx] + + ret_json = calc_acc(data_main, 'overall') + + if self.has_l2_category: + data_main['l2-category'] = 
[l2_cate_map[i] for i in main_idx] + l2 = calc_acc(data_main, 'l2-category') + ret_json.update(l2) + else: + leaf = calc_acc(data_main, 'category') + ret_json.update(leaf) + print_log('============================================', 'current') + show_result(ret_json) + print_log('============================================', 'current') + print_log('Multiple Choice successfully finished evaluating' 'current') + return ret_json diff --git a/xtuner/dataset/evaluation/utils.py b/xtuner/dataset/evaluation/utils.py new file mode 100644 index 000000000..e01179aec --- /dev/null +++ b/xtuner/dataset/evaluation/utils.py @@ -0,0 +1,75 @@ +import numpy as np +from collections import defaultdict + + +def process_punctuation(inText): + import re + outText = inText + punct = [ + ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', + '>', '<', '@', '`', ',', '?', '!' + ] + commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 + periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 + for p in punct: + if (p + ' ' in inText or ' ' + p in inText) or (re.search( + commaStrip, inText) is not None): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = periodStrip.sub('', outText, re.UNICODE) + return outText + + +def YOrN_Extraction(output): + s = output.lower() + words = process_punctuation(s).split() + if 'yes' in words and 'no' not in words: + return 'Yes' + if 'yes' not in words and 'no' in words: + return 'No' + return 'Unknown' + + +def MME_rating(data): + stats = defaultdict(dict) + lt = len(data) + for i in range(lt): + item = data.iloc[i] + category = item['category'] + image_path = item['image_path'] + score = item['score'] + if image_path not in stats[category]: + stats[category][image_path] = [] + stats[category][image_path].append(score) + + def acc(key, mode='normal'): + res = stats[key] + values = [] + for val in res.values(): + if mode == 'normal': + values.extend(val) + elif mode == 'plus': + values.append(val[0] * val[1]) + return np.mean(values) * 100 + + scores = {} + for k in stats: + scores[k] = acc(k) + acc(k, 'plus') + + super_cates = dict( + perception=[ + 'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence', + 'landmark', 'position', 'posters', 'scene' + ], + reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'] + ) + + ret = {} + for sc, cate_list in super_cates.items(): + base = 0 + for c in cate_list: + base += scores[c] + ret[sc] = base + ret.update(scores) + return ret diff --git a/xtuner/engine/__init__.py b/xtuner/engine/__init__.py index 4f50972ea..ae4a46726 100644 --- a/xtuner/engine/__init__.py +++ b/xtuner/engine/__init__.py @@ -2,9 +2,10 @@ from ._strategy import DeepSpeedStrategy from .hooks import (DatasetInfoHook, EvaluateChatHook, ThroughputHook, VarlenAttnArgsToMessageHubHook) -from .runner import TrainLoop +from .runner import TrainLoop, ValLoop, TestLoop __all__ = [ 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook', - 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop' + 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop', + 'ValLoop', 'TestLoop' ] diff --git a/xtuner/engine/hooks/evaluate_chat_hook.py b/xtuner/engine/hooks/evaluate_chat_hook.py index 8e6a86822..fd6650705 100644 --- a/xtuner/engine/hooks/evaluate_chat_hook.py +++ b/xtuner/engine/hooks/evaluate_chat_hook.py @@ -16,7 +16,6 @@ class EvaluateChatHook(Hook): - priority = 'LOW' def __init__(self, @@ -108,52 +107,14 @@ def _eval_images(self, for sample_image, 
sample_input in zip(self.evaluation_images, self.evaluation_inputs): - image = expand2square( - sample_image, - tuple(int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - image = image.to(device) - sample_input = DEFAULT_IMAGE_TOKEN + '\n' + sample_input - inputs = (self.system + self.instruction).format( - input=sample_input, round=1, **runner.cfg) - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode( - chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - input_ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - input_ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - input_ids.append(IMAGE_TOKEN_INDEX) - input_ids = torch.tensor(input_ids).to(device) - visual_outputs = model.visual_encoder( - image.unsqueeze(0).to(model.visual_encoder.dtype), - output_hidden_states=True) - pixel_values = model.projector( - visual_outputs.hidden_states[model.visual_select_layer][:, 1:]) - mm_inputs = prepare_inputs_labels_for_multimodal( - llm=model.llm, - input_ids=input_ids.unsqueeze(0), - pixel_values=pixel_values) - - generation_output = model.generate( - **mm_inputs, - max_new_tokens=max_new_tokens, - generation_config=self.gen_config, - bos_token_id=self.tokenizer.bos_token_id, - stopping_criteria=self.stop_criteria) - generation_output = self.tokenizer.decode(generation_output[0]) + generation_output = model.chat({'image': sample_image, 'text': sample_input}) + inputs = generation_output['inputs'] + prediction = generation_output['prediction'] runner.logger.info(f'Sample output:\n' - f'{inputs + generation_output}\n') + f'{inputs + prediction}\n') if save_eval_output: - eval_outputs.append(f'{inputs + generation_output}\n') + eval_outputs.append(f'{inputs + prediction}\n') if save_eval_output: self._save_eval_output(runner, eval_outputs) @@ -196,13 +157,11 @@ def _generate_samples(self, model = model.module device = next(iter(model.parameters())).device - is_checkpointing = model.llm.is_gradient_checkpointing - use_cache = model.llm.config.use_cache - # Cast to inference mode - model.activation_checkpointing_disable() - model.llm.config.use_cache = True + model.gradient_checkpointing_disable() model.eval() + model.preparing_for_generation({'generation_kwargs': {'max_new_tokens': max_new_tokens}}) + if self.evaluation_images is not None: self._eval_images(runner, model, device, max_new_tokens, save_eval_output) @@ -211,9 +170,7 @@ def _generate_samples(self, save_eval_output) # Cast to training mode - if is_checkpointing: - model.activation_checkpointing_enable() - model.llm.config.use_cache = use_cache + model.gradient_checkpointing_enable() model.train() def before_train(self, runner): @@ -231,7 +188,7 @@ def _is_save_checkpoint(self, runner): return False if checkpoint_hook.every_n_train_iters( - runner, checkpoint_hook.interval, checkpoint_hook.save_begin) or \ + runner, checkpoint_hook.interval, checkpoint_hook.save_begin) or \ (checkpoint_hook.save_last and checkpoint_hook.is_last_train_iter(runner)): return True @@ -249,8 +206,8 @@ def after_train_iter(self, save_eval_output = self._is_save_checkpoint(runner) do_chat = ( - save_eval_output - or self.every_n_train_iters(runner, self.every_n_iters)) + save_eval_output + or self.every_n_train_iters(runner, self.every_n_iters)) if not 
do_chat: return diff --git a/xtuner/engine/runner/__init__.py b/xtuner/engine/runner/__init__.py index d8d1c582b..c621e5f1b 100644 --- a/xtuner/engine/runner/__init__.py +++ b/xtuner/engine/runner/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .loops import TrainLoop +from .loops import TrainLoop, ValLoop, TestLoop -__all__ = ['TrainLoop'] +__all__ = ['TrainLoop', 'ValLoop', 'TestLoop'] diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index aeb6be31a..66d4a5572 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -2,7 +2,16 @@ from typing import Dict, Optional, Union from mmengine.runner import IterBasedTrainLoop +from mmengine.runner import ValLoop as MMENGINE_ValLoop +from mmengine.runner import TestLoop as MMENGINE_TestLoop from torch.utils.data import DataLoader +from typing import Sequence +from mmengine.dist import broadcast_object_list, is_main_process, get_world_size, get_rank,barrier, collect_results +from xtuner.registry import BUILDER +import math +from tqdm import tqdm +import torch +from mmengine.runner.amp import autocast class TrainLoop(IterBasedTrainLoop): @@ -38,3 +47,183 @@ def __init__(self, raise NotImplementedError super().__init__( runner=runner, dataloader=dataloader, max_iters=iters, **kwargs) + + +class ValLoop(MMENGINE_ValLoop): + def __init__(self, + runner, + dataloader=None, + evaluator=None, + fp16: bool = False, + select_metric='first') -> None: + self._runner = runner + self.fp16 = fp16 + self.select_metric = select_metric + self.datasets = dataloader['dataset'] + if not isinstance(self.datasets, Sequence): + self.datasets = [self.datasets] + + @property + def runner(self): + return self._runner + + def _build_dataset(self, dataset_cfg): + if is_main_process(): + dataset = BUILDER.build(dataset_cfg) + objects = [dataset] + else: + objects = [None] + dataset = broadcast_object_list(objects)[0] + return dataset + + def run(self) -> dict: + """Launch validation.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.gradient_checkpointing_disable() + self.runner.model.eval() + + rank = get_rank() + metrics = [] + for _, dataset_cfg in enumerate(self.datasets): + dataset = self._build_dataset(dataset_cfg) + assert len(dataset) > 0, 'The dataset is empty' + + self.runner.model.preparing_for_generation(dataset.get('metainfo', None)) + + results = [] + n_samples = len(dataset) + per_rank_samples = math.ceil(n_samples / get_world_size()) + per_rank_ids = range(per_rank_samples * rank, + min(n_samples, per_rank_samples * (rank + 1))) + for idx in tqdm(per_rank_ids, desc=f'Rank {rank}'): + data_batch = dataset[idx] + self.run_iter(idx, data_batch, results) + + barrier() + results = collect_results(results, len(dataset)) + + if is_main_process(): + metric = dataset.evaluate(results, self.runner.work_dir) + objects = [metric] + else: + objects = [None] + metric = broadcast_object_list(objects)[0] + metrics.append(metric) + del dataset + + # select metrics + if self.select_metric == 'first': + metrics = metrics[0] + else: + raise NotImplementedError + + self.runner.call_hook('after_val_epoch', metrics=metrics) + self.runner.call_hook('after_val') + self.runner.model.gradient_checkpointing_enable() + self.runner.model.train() + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict], results: list): + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data + from dataloader. 
+ """ + assert 'img_id' in data_batch, 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' + prediction = {'img_id': data_batch['img_id']} + + self.runner.call_hook( + 'before_val_iter', batch_idx=idx, data_batch=data_batch) + + # outputs should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + outputs = self.runner.model.val_step(data_batch) + prediction['prediction'] = outputs['prediction'] + results.append(prediction) + + self.runner.call_hook( + 'after_val_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) + + +class TestLoop(ValLoop): + def run(self) -> dict: + """Launch validation.""" + self.runner.call_hook('before_test') + self.runner.call_hook('before_test_epoch') + self.runner.model.gradient_checkpointing_disable() + self.runner.model.eval() + + rank = get_rank() + metrics = [] + for _, dataset_cfg in enumerate(self.datasets): + dataset = self._build_dataset(dataset_cfg) + assert len(dataset) > 0, 'The dataset is empty' + + results = [] + n_samples = len(dataset) + per_rank_samples = math.ceil(n_samples / get_world_size()) + per_rank_ids = range(per_rank_samples * rank, + min(n_samples, per_rank_samples * (rank + 1))) + for idx in tqdm(per_rank_ids, desc=f'Rank {rank}'): + data_batch = dataset[idx] + self.run_iter(idx, data_batch, results) + + barrier() + results = collect_results(results, len(dataset)) + + if is_main_process(): + metric = dataset.evaluate(results, self.runner.work_dir) + objects = [metric] + else: + objects = [None] + metric = broadcast_object_list(objects)[0] + metrics.append(metric) + del dataset + + # select metrics + if self.select_metric == 'first': + metrics = metrics[0] + else: + raise NotImplementedError + self.runner.call_hook('after_test_epoch', metrics=metrics) + self.runner.call_hook('after_test') + + self.runner.model.gradient_checkpointing_enable() + self.runner.model.train() + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict], results: list): + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data + from dataloader. + """ + assert 'img_id' in data_batch, 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' 
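+        # Keep `img_id` alongside the model output so that the dataset's
+        # `evaluate()` can re-align the gathered predictions with its samples
+        # after `collect_results()`.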
+ prediction = {'img_id': data_batch['img_id']} + + self.runner.call_hook( + 'before_test_iter', batch_idx=idx, data_batch=data_batch) + + # outputs should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + outputs = self.runner.model.val_step(data_batch) + prediction.update(outputs) + results.append(prediction) + + self.runner.call_hook( + 'after_test_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 19b427a75..039fac551 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -7,7 +7,7 @@ from mmengine.config import Config, ConfigDict from mmengine.model import BaseModel from peft import get_peft_model, prepare_model_for_kbit_training -from transformers import AutoConfig +from transformers import AutoConfig, GenerationConfig from xtuner.registry import BUILDER from .modules import ProjectorConfig, ProjectorModel, dispatch_modules @@ -16,6 +16,10 @@ get_peft_model_state_dict, guess_load_checkpoint, make_inputs_require_grad, prepare_inputs_labels_for_multimodal, traverse_dict) +from xtuner.tools.utils import get_stop_criteria +from xtuner.dataset.utils import expand2square, load_image +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, + StopWordStoppingCriteria) class LLaVAModel(BaseModel): @@ -31,7 +35,10 @@ def __init__(self, llm_lora=None, visual_encoder_lora=None, use_activation_checkpointing=True, - max_position_embeddings=None): + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None): super().__init__() self.freeze_llm = freeze_llm self.freeze_visual_encoder = freeze_visual_encoder @@ -57,6 +64,7 @@ def __init__(self, if self.freeze_visual_encoder: self.visual_encoder.requires_grad_(False) + self.use_activation_checkpointing = use_activation_checkpointing if use_activation_checkpointing: # For backward compatibility if hasattr(self.llm, 'enable_input_require_grads'): @@ -93,6 +101,15 @@ def __init__(self, self._is_init = True + self.tokenizer = tokenizer + if tokenizer is not None: + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = image_processor + if image_processor is not None: + self.image_processor = BUILDER.build(image_processor) + self.template = template + + def _parse_lora_config(self, lora_config): if isinstance(lora_config, dict) or isinstance( lora_config, Config) or isinstance(lora_config, ConfigDict): @@ -120,16 +137,18 @@ def _prepare_visual_encoder_for_lora(self, self.visual_encoder = get_peft_model(self.visual_encoder, lora_config) def gradient_checkpointing_enable(self): - self.activation_checkpointing_enable() + if self.use_activation_checkpointing: + self.activation_checkpointing_enable() + + def gradient_checkpointing_disable(self): + if self.use_activation_checkpointing: + self.activation_checkpointing_disable() def activation_checkpointing_enable(self): self.llm.gradient_checkpointing_enable() self.visual_encoder.gradient_checkpointing_enable() self.projector.gradient_checkpointing_enable() - def gradient_checkpointing_disable(self): - self.activation_checkpointing_disable() - def activation_checkpointing_disable(self): self.llm.gradient_checkpointing_disable() self.visual_encoder.gradient_checkpointing_disable() @@ -230,7 +249,7 @@ def _build_from_cfg_or_module(self, cfg_or_mod): else: raise NotImplementedError - def forward(self, data, data_samples=None, mode='loss'): + def _prepare_data_for_llm(self, data): if 'pixel_values' in data: visual_outputs = self.visual_encoder( 
data['pixel_values'].to(self.visual_encoder.dtype), @@ -239,34 +258,109 @@ def forward(self, data, data_samples=None, mode='loss'): visual_outputs.hidden_states[self.visual_select_layer][:, 1:]) data['pixel_values'] = pixel_values data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data) + return data + + def forward(self, data, data_samples=None, mode='loss'): + data = self._prepare_data_for_llm(data) if mode == 'loss': return self.compute_loss(data, data_samples) - elif mode == 'predict': - return self.predict(data, data_samples) - elif mode == 'tensor': - return self._forward(data, data_samples) + elif mode == 'predict' or mode == 'generate': + return self.generate(data, data_samples) + elif mode == 'chat': + return self.chat(data) else: raise NotImplementedError - def _forward(self, data, data_samples=None): - - outputs = self.llm(**data) - - return outputs - - def predict(self, data, data_samples=None): - outputs = self.llm(**data) - logits_dict = [{'logits': logits} for logits in outputs.logits] - return logits_dict - def compute_loss(self, data, data_samples=None): outputs = self.llm(**data) loss_dict = {'loss': outputs.loss} return loss_dict - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.llm, name) + def preparing_for_generation(self, metainfo: dict = None): + default_generation_kwargs = dict( + max_new_tokens=100, + do_sample=False, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None else + self.tokenizer.eos_token_id) + default_generation_kwargs.update(metainfo.get('generation_kwargs', {})) + self.gen_config = GenerationConfig(**default_generation_kwargs) + + stop_words = [] + stop_words += self.template.get('STOP_WORDS', []) + stop_criteria = get_stop_criteria( + tokenizer=self.tokenizer, stop_words=stop_words) + self.stop_criteria = stop_criteria + + def generate(self, data, data_samples=None): + # TODO: It is the direct output of the dataset without going through the dataloader. 
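+        # Hence `input_ids` and `pixel_values` arrive without a batch
+        # dimension and are unsqueezed to a batch of one below.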
+ input_ids = data['input_ids'].unsqueeze(0).to(self.visual_encoder.device) + data['input_ids'] = input_ids + pixel_values = data['pixel_values'].unsqueeze(0).to(self.visual_encoder.device) + data['pixel_values'] = pixel_values + + mm_inputs = self._prepare_data_for_llm(data) + generate_output = self.llm.generate( + **mm_inputs, + generation_config=self.gen_config, + streamer=None, + bos_token_id=self.tokenizer.bos_token_id, + stopping_criteria=self.stop_criteria) + + prediction = self.tokenizer.decode( + generate_output[0], skip_special_tokens=True).strip() + + return dict(prediction=prediction) + + def chat(self, data, system=''): + # single image and single text mode + instruction = self.template.get('INSTRUCTION', '{input}') + + sample_image = data['img'] + sample_input = data['text'] + + image = expand2square( + sample_image, + tuple(int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + image = image.to(self.visual_encoder.device) + sample_input = DEFAULT_IMAGE_TOKEN + '\n' + sample_input + if system != '': + system = self.template.get( + 'SYSTEM', '{system}\n').format(system=system) + + inputs = (system + instruction).format(input=sample_input, round=1) + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode( + chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + input_ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + input_ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + input_ids.append(IMAGE_TOKEN_INDEX) + input_ids = torch.tensor(input_ids).to(self.visual_encoder.device) + + data['input_ids'] = input_ids.unsqueeze(0) + data['pixel_values'] = image.unsqueeze(0) + + mm_inputs = self._prepare_data_for_llm(data) + generate_output = self.llm.generate( + **mm_inputs, + generation_config=self.gen_config, + streamer=None, + bos_token_id=self.tokenizer.bos_token_id, + stopping_criteria=self.stop_criteria) + + prediction = self.tokenizer.decode( + generate_output[0], skip_special_tokens=True).strip() + + return dict(prediction=prediction, inputs=inputs) diff --git a/xtuner/model/utils.py b/xtuner/model/utils.py index dce86315d..0a35b5970 100644 --- a/xtuner/model/utils.py +++ b/xtuner/model/utils.py @@ -134,7 +134,8 @@ def prepare_inputs_labels_for_multimodal( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, labels: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None): + pixel_values: Optional[torch.FloatTensor] = None, + **kwargs): if pixel_values is None: return { 'input_ids': input_ids, From 4dd223eb3a6c4ea6c8b37c0d68c9b497cf4556a0 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 29 Mar 2024 17:15:24 +0800 Subject: [PATCH 002/126] fix --- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 41 +++++++++++------ xtuner/dataset/evaluation/__init__.py | 2 +- .../dataset/evaluation/base_eval_dataset.py | 10 +++++ xtuner/engine/hooks/dataset_info_hook.py | 14 +++--- xtuner/engine/runner/loops.py | 44 ++++++------------- xtuner/model/llava.py | 5 +-- 6 files changed, 61 insertions(+), 55 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py 
b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index 66d67bdda..72bf0c629 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -15,6 +15,7 @@ from xtuner.utils import PROMPT_TEMPLATE from xtuner.model import LLaVAModel from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset import ConcatDataset ####################################################################### # PART 1 Settings # @@ -70,6 +71,7 @@ type=LLaVAModel, tokenizer=tokenizer, template=prompt_template, + image_processor=image_processor, freeze_llm=True, freeze_visual_encoder=True, llm=dict( @@ -202,14 +204,15 @@ log_processor = dict(by_epoch=False) # ==================== val and test cfg ======================= -val_dataset = dict( - type=MMELLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', - image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True) +val_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True)] test_dataset = [ dict( @@ -229,14 +232,24 @@ pad_image_to_square=True) ] -# TODO: We are not currently using val_dataloader and val_evaluator, -# only utilizing val_dataset. -val_dataloader = dict(dataset=val_dataset) +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset)) val_evaluator = dict() val_cfg = dict(type=ValLoop) -# TODO: We are not currently using test_dataloader and test_evaluator, -# only utilizing test_dataset. -test_dataloader = dict(dataset=test_dataset) +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset)) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') + diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 1a09f5c64..0c995e54d 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -1,4 +1,4 @@ from .mme_llava_dataset import MMELLaVADataset -from multiple_choice_llava_dataset import MultipleChoiceLLaVADataset +from .multiple_choice_llava_dataset import MultipleChoiceLLaVADataset __all__ = ['MMELLaVADataset', 'MultipleChoiceLLaVADataset'] diff --git a/xtuner/dataset/evaluation/base_eval_dataset.py b/xtuner/dataset/evaluation/base_eval_dataset.py index 1fba323b2..dab081462 100644 --- a/xtuner/dataset/evaluation/base_eval_dataset.py +++ b/xtuner/dataset/evaluation/base_eval_dataset.py @@ -54,6 +54,16 @@ def _load_metainfo(cls, cls_metainfo[k] = v return cls_metainfo + @property + def metainfo(self) -> dict: + """Get meta information of dataset. 
+ + Returns: + dict: meta information collected from ``BaseDataset.METAINFO``, + annotation file and metainfo argument during instantiation. + """ + return copy.deepcopy(self._metainfo) + @abstractmethod def evaluate(self, results, work_dir): pass diff --git a/xtuner/engine/hooks/dataset_info_hook.py b/xtuner/engine/hooks/dataset_info_hook.py index d835311dc..5b4e513be 100644 --- a/xtuner/engine/hooks/dataset_info_hook.py +++ b/xtuner/engine/hooks/dataset_info_hook.py @@ -46,17 +46,19 @@ def before_train(self, runner) -> None: if do_train: train_dataset = runner.train_dataloader.dataset self.log(runner, train_dataset, mode='train') - if do_eval: + if do_eval and hasattr(runner, 'val_dataloader'): eval_dataset = runner.val_dataloader.dataset self.log(runner, eval_dataset, mode='eval') - if do_test: + if do_test and hasattr(runner, 'test_dataloader'): test_dataset = runner.test_dataloader.dataset self.log(runner, test_dataset, mode='test') def before_val(self, runner) -> None: - eval_dataset = runner.val_dataloader.dataset - self.log(runner, eval_dataset, mode='eval') + if hasattr(runner, 'val_dataloader'): + eval_dataset = runner.val_dataloader.dataset + self.log(runner, eval_dataset, mode='eval') def before_test(self, runner) -> None: - test_dataset = runner.test_dataloader.dataset - self.log(runner, test_dataset, mode='test') + if hasattr(runner, 'test_dataloader'): + test_dataset = runner.test_dataloader.dataset + self.log(runner, test_dataset, mode='test') diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 66d4a5572..078e9127f 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -50,31 +50,12 @@ def __init__(self, class ValLoop(MMENGINE_ValLoop): - def __init__(self, - runner, - dataloader=None, - evaluator=None, - fp16: bool = False, - select_metric='first') -> None: + def __init__(self, runner, dataloader=None, evaluator=None, fp16: bool = False, select_metric='first') -> None: + # must be concatset + super(MMENGINE_ValLoop, self).__init__(runner, dataloader) self._runner = runner self.fp16 = fp16 self.select_metric = select_metric - self.datasets = dataloader['dataset'] - if not isinstance(self.datasets, Sequence): - self.datasets = [self.datasets] - - @property - def runner(self): - return self._runner - - def _build_dataset(self, dataset_cfg): - if is_main_process(): - dataset = BUILDER.build(dataset_cfg) - objects = [dataset] - else: - objects = [None] - dataset = broadcast_object_list(objects)[0] - return dataset def run(self) -> dict: """Launch validation.""" @@ -85,11 +66,8 @@ def run(self) -> dict: rank = get_rank() metrics = [] - for _, dataset_cfg in enumerate(self.datasets): - dataset = self._build_dataset(dataset_cfg) - assert len(dataset) > 0, 'The dataset is empty' - - self.runner.model.preparing_for_generation(dataset.get('metainfo', None)) + for _, dataset in enumerate(self.dataloader.datasets): + self.runner.model.preparing_for_generation(dataset.metainfo) results = [] n_samples = len(dataset) @@ -108,7 +86,8 @@ def run(self) -> dict: objects = [metric] else: objects = [None] - metric = broadcast_object_list(objects)[0] + broadcast_object_list(objects) + metric = objects[0] metrics.append(metric) del dataset @@ -142,7 +121,7 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): # outputs should be sequence of BaseDataElement with autocast(enabled=self.fp16): - outputs = self.runner.model.val_step(data_batch) + outputs = self.runner.model.val_step({'data': data_batch}) prediction['prediction'] 
= outputs['prediction'] results.append(prediction) @@ -167,6 +146,8 @@ def run(self) -> dict: dataset = self._build_dataset(dataset_cfg) assert len(dataset) > 0, 'The dataset is empty' + self.runner.model.preparing_for_generation(dataset.metainfo) + results = [] n_samples = len(dataset) per_rank_samples = math.ceil(n_samples / get_world_size()) @@ -184,7 +165,8 @@ def run(self) -> dict: objects = [metric] else: objects = [None] - metric = broadcast_object_list(objects)[0] + broadcast_object_list(objects) + metric = objects[0] metrics.append(metric) del dataset @@ -218,7 +200,7 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): # outputs should be sequence of BaseDataElement with autocast(enabled=self.fp16): - outputs = self.runner.model.val_step(data_batch) + outputs = self.runner.model.val_step({'data': data_batch}) prediction.update(outputs) results.append(prediction) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 039fac551..c7c1c1b15 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -261,9 +261,8 @@ def _prepare_data_for_llm(self, data): return data def forward(self, data, data_samples=None, mode='loss'): - data = self._prepare_data_for_llm(data) - if mode == 'loss': + data = self._prepare_data_for_llm(data) return self.compute_loss(data, data_samples) elif mode == 'predict' or mode == 'generate': return self.generate(data, data_samples) @@ -318,7 +317,7 @@ def chat(self, data, system=''): # single image and single text mode instruction = self.template.get('INSTRUCTION', '{input}') - sample_image = data['img'] + sample_image = data['image'] sample_input = data['text'] image = expand2square( From d2428af55e0c01c171e8de6c38c947a3cbdb7957 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 29 Mar 2024 17:32:24 +0800 Subject: [PATCH 003/126] fix --- ..._2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py | 10 +++++++++- xtuner/engine/hooks/dataset_info_hook.py | 14 ++++++-------- xtuner/engine/runner/loops.py | 13 ++++--------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index 72bf0c629..e2179ba28 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -212,7 +212,15 @@ prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, - pad_image_to_square=True)] + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] test_dataset = [ dict( diff --git a/xtuner/engine/hooks/dataset_info_hook.py b/xtuner/engine/hooks/dataset_info_hook.py index 5b4e513be..d835311dc 100644 --- a/xtuner/engine/hooks/dataset_info_hook.py +++ b/xtuner/engine/hooks/dataset_info_hook.py @@ -46,19 +46,17 @@ def before_train(self, runner) -> None: if do_train: train_dataset = runner.train_dataloader.dataset self.log(runner, train_dataset, mode='train') - if do_eval and hasattr(runner, 'val_dataloader'): + if do_eval: eval_dataset = runner.val_dataloader.dataset self.log(runner, eval_dataset, 
mode='eval') - if do_test and hasattr(runner, 'test_dataloader'): + if do_test: test_dataset = runner.test_dataloader.dataset self.log(runner, test_dataset, mode='test') def before_val(self, runner) -> None: - if hasattr(runner, 'val_dataloader'): - eval_dataset = runner.val_dataloader.dataset - self.log(runner, eval_dataset, mode='eval') + eval_dataset = runner.val_dataloader.dataset + self.log(runner, eval_dataset, mode='eval') def before_test(self, runner) -> None: - if hasattr(runner, 'test_dataloader'): - test_dataset = runner.test_dataloader.dataset - self.log(runner, test_dataset, mode='test') + test_dataset = runner.test_dataloader.dataset + self.log(runner, test_dataset, mode='test') diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 078e9127f..b923a57f2 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -66,7 +66,7 @@ def run(self) -> dict: rank = get_rank() metrics = [] - for _, dataset in enumerate(self.dataloader.datasets): + for _, dataset in enumerate(self.dataloader.dataset.datasets): self.runner.model.preparing_for_generation(dataset.metainfo) results = [] @@ -74,7 +74,7 @@ def run(self) -> dict: per_rank_samples = math.ceil(n_samples / get_world_size()) per_rank_ids = range(per_rank_samples * rank, min(n_samples, per_rank_samples * (rank + 1))) - for idx in tqdm(per_rank_ids, desc=f'Rank {rank}'): + for idx in per_rank_ids: data_batch = dataset[idx] self.run_iter(idx, data_batch, results) @@ -89,7 +89,6 @@ def run(self) -> dict: broadcast_object_list(objects) metric = objects[0] metrics.append(metric) - del dataset # select metrics if self.select_metric == 'first': @@ -142,10 +141,7 @@ def run(self) -> dict: rank = get_rank() metrics = [] - for _, dataset_cfg in enumerate(self.datasets): - dataset = self._build_dataset(dataset_cfg) - assert len(dataset) > 0, 'The dataset is empty' - + for _, dataset in enumerate(self.dataloader.dataset.datasets): self.runner.model.preparing_for_generation(dataset.metainfo) results = [] @@ -153,7 +149,7 @@ def run(self) -> dict: per_rank_samples = math.ceil(n_samples / get_world_size()) per_rank_ids = range(per_rank_samples * rank, min(n_samples, per_rank_samples * (rank + 1))) - for idx in tqdm(per_rank_ids, desc=f'Rank {rank}'): + for idx in per_rank_ids: data_batch = dataset[idx] self.run_iter(idx, data_batch, results) @@ -168,7 +164,6 @@ def run(self) -> dict: broadcast_object_list(objects) metric = objects[0] metrics.append(metric) - del dataset # select metrics if self.select_metric == 'first': From 39add6b0d0c84dee3ec50cd865610015e1b90ba2 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 29 Mar 2024 17:55:30 +0800 Subject: [PATCH 004/126] update --- xtuner/engine/runner/loops.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index b923a57f2..8780b0ff1 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -3,13 +3,10 @@ from mmengine.runner import IterBasedTrainLoop from mmengine.runner import ValLoop as MMENGINE_ValLoop -from mmengine.runner import TestLoop as MMENGINE_TestLoop from torch.utils.data import DataLoader from typing import Sequence -from mmengine.dist import broadcast_object_list, is_main_process, get_world_size, get_rank,barrier, collect_results -from xtuner.registry import BUILDER +from mmengine.dist import broadcast_object_list, is_main_process, get_world_size, get_rank, barrier, collect_results import math -from tqdm import tqdm 
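Note: the rewritten ValLoop/TestLoop above shard each eval dataset across ranks by index, then gather the per-rank predictions with collect_results before dataset.evaluate() runs on the main process. Below is a minimal standalone sketch of that sharding pattern; it only assumes mmengine is installed, and the toy data plus the lambda are placeholders standing in for the real dataset and model, so treat it as an illustration of the logic rather than the loop itself.

import math
from mmengine.dist import (barrier, collect_results, get_rank, get_world_size,
                           is_main_process)

def shard_and_collect(dataset, infer_fn):
    """Split `dataset` evenly across ranks, run `infer_fn` on each local sample
    and gather every prediction back on rank 0 (mirrors the loop logic above)."""
    rank, world_size = get_rank(), get_world_size()
    n_samples = len(dataset)
    per_rank = math.ceil(n_samples / world_size)
    per_rank_ids = range(per_rank * rank, min(n_samples, per_rank * (rank + 1)))

    results = [infer_fn(dataset[idx]) for idx in per_rank_ids]
    barrier()
    # merge the per-rank partial lists back into one list of n_samples items
    results = collect_results(results, n_samples)
    return results if is_main_process() else None

if __name__ == '__main__':
    # also runs single-process: world_size == 1, rank == 0
    print(shard_and_collect(list(range(10)), lambda x: x * x))

Launched with torchrun the work is split across ranks; in a single process it degrades to evaluating everything locally, which is the same behaviour the loops rely on.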
import torch from mmengine.runner.amp import autocast @@ -59,6 +56,7 @@ def __init__(self, runner, dataloader=None, evaluator=None, fp16: bool = False, def run(self) -> dict: """Launch validation.""" + self.runner.logger.info('==================== Start val loop ===================') self.runner.call_hook('before_val') self.runner.call_hook('before_val_epoch') self.runner.model.gradient_checkpointing_disable() @@ -66,6 +64,7 @@ def run(self) -> dict: rank = get_rank() metrics = [] + for _, dataset in enumerate(self.dataloader.dataset.datasets): self.runner.model.preparing_for_generation(dataset.metainfo) @@ -79,8 +78,9 @@ def run(self) -> dict: self.run_iter(idx, data_batch, results) barrier() + self.runner.logger.info('==================== Start collect results ===================') results = collect_results(results, len(dataset)) - + self.runner.logger.info('========= Starting the evaluation of a data ===========') if is_main_process(): metric = dataset.evaluate(results, self.runner.work_dir) objects = [metric] @@ -96,6 +96,7 @@ def run(self) -> dict: else: raise NotImplementedError + self.runner.logger.info('================ Ending val loop ================') self.runner.call_hook('after_val_epoch', metrics=metrics) self.runner.call_hook('after_val') self.runner.model.gradient_checkpointing_enable() @@ -134,6 +135,7 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): class TestLoop(ValLoop): def run(self) -> dict: """Launch validation.""" + self.runner.logger.info('==================== Start test loop ===================') self.runner.call_hook('before_test') self.runner.call_hook('before_test_epoch') self.runner.model.gradient_checkpointing_disable() @@ -154,7 +156,9 @@ def run(self) -> dict: self.run_iter(idx, data_batch, results) barrier() + self.runner.logger.info('==================== Start collect results ===================') results = collect_results(results, len(dataset)) + self.runner.logger.info('========= Starting the evaluation of a data ===========') if is_main_process(): metric = dataset.evaluate(results, self.runner.work_dir) @@ -172,7 +176,7 @@ def run(self) -> dict: raise NotImplementedError self.runner.call_hook('after_test_epoch', metrics=metrics) self.runner.call_hook('after_test') - + self.runner.logger.info('================ Ending test loop ================') self.runner.model.gradient_checkpointing_enable() self.runner.model.train() return metrics From f154bb91adea9eaae88d4cf555677c175e1c8490 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 29 Mar 2024 18:03:05 +0800 Subject: [PATCH 005/126] update --- ..._2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index e2179ba28..8547e451c 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -213,13 +213,13 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - 
pad_image_to_square=True) + # dict( + # type=MultipleChoiceLLaVADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) ] test_dataset = [ From 2feb0e33a2465d02469febe79039f444fca2563e Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 10:53:37 +0800 Subject: [PATCH 006/126] fix ddp --- xtuner/engine/runner/loops.py | 53 ++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 8780b0ff1..6679b4f91 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -8,7 +8,10 @@ from mmengine.dist import broadcast_object_list, is_main_process, get_world_size, get_rank, barrier, collect_results import math import torch -from mmengine.runner.amp import autocast +from mmengine.model import is_model_wrapper + +TORCH_DTYPE_MAP = dict( + fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto') class TrainLoop(IterBasedTrainLoop): @@ -47,11 +50,13 @@ def __init__(self, class ValLoop(MMENGINE_ValLoop): - def __init__(self, runner, dataloader=None, evaluator=None, fp16: bool = False, select_metric='first') -> None: + def __init__(self, runner, dataloader=None, evaluator=None, torch_dtype=None, select_metric='first') -> None: # must be concatset super(MMENGINE_ValLoop, self).__init__(runner, dataloader) self._runner = runner - self.fp16 = fp16 + self.torch_dtype = torch_dtype + if torch_dtype is not None: + self.torch_dtype = TORCH_DTYPE_MAP[torch_dtype] self.select_metric = select_metric def run(self) -> dict: @@ -59,14 +64,20 @@ def run(self) -> dict: self.runner.logger.info('==================== Start val loop ===================') self.runner.call_hook('before_val') self.runner.call_hook('before_val_epoch') - self.runner.model.gradient_checkpointing_disable() - self.runner.model.eval() + + if is_model_wrapper(self.runner.model): + model = self.runner.model.module + else: + model = self.runner.model + + model.gradient_checkpointing_disable() + model.eval() rank = get_rank() metrics = [] for _, dataset in enumerate(self.dataloader.dataset.datasets): - self.runner.model.preparing_for_generation(dataset.metainfo) + model.preparing_for_generation(dataset.metainfo) results = [] n_samples = len(dataset) @@ -99,8 +110,8 @@ def run(self) -> dict: self.runner.logger.info('================ Ending val loop ================') self.runner.call_hook('after_val_epoch', metrics=metrics) self.runner.call_hook('after_val') - self.runner.model.gradient_checkpointing_enable() - self.runner.model.train() + model.gradient_checkpointing_enable() + model.train() return metrics @torch.no_grad() @@ -120,8 +131,7 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): 'before_val_iter', batch_idx=idx, data_batch=data_batch) # outputs should be sequence of BaseDataElement - with autocast(enabled=self.fp16): - outputs = self.runner.model.val_step({'data': data_batch}) + outputs = self.runner.model.val_step({'data': data_batch}) prediction['prediction'] = outputs['prediction'] results.append(prediction) @@ -138,13 +148,23 @@ def run(self) -> dict: self.runner.logger.info('==================== Start test loop ===================') self.runner.call_hook('before_test') self.runner.call_hook('before_test_epoch') - self.runner.model.gradient_checkpointing_disable() - self.runner.model.eval() + + if 
is_model_wrapper(self.runner.model): + model = self.runner.model.module + else: + model = self.runner.model + + model.gradient_checkpointing_disable() + model.eval() + + if self.torch_dtype is not None: + self.runner.logger.info(f'Convert model dtype to {self.torch_dtype}') + model.to(self.torch_dtype) rank = get_rank() metrics = [] for _, dataset in enumerate(self.dataloader.dataset.datasets): - self.runner.model.preparing_for_generation(dataset.metainfo) + model.preparing_for_generation(dataset.metainfo) results = [] n_samples = len(dataset) @@ -177,8 +197,8 @@ def run(self) -> dict: self.runner.call_hook('after_test_epoch', metrics=metrics) self.runner.call_hook('after_test') self.runner.logger.info('================ Ending test loop ================') - self.runner.model.gradient_checkpointing_enable() - self.runner.model.train() + model.gradient_checkpointing_enable() + model.train() return metrics @torch.no_grad() @@ -198,8 +218,7 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): 'before_test_iter', batch_idx=idx, data_batch=data_batch) # outputs should be sequence of BaseDataElement - with autocast(enabled=self.fp16): - outputs = self.runner.model.val_step({'data': data_batch}) + outputs = self.runner.model.val_step({'data': data_batch}) prediction.update(outputs) results.append(prediction) From cd0a01b4b4f36fd7e7c109e5622aa68fa7397584 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 11:01:06 +0800 Subject: [PATCH 007/126] add config --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py new file mode 100644 index 000000000..60e0f8bb7 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
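Note: the "fix ddp" commit above unwraps the (possibly DDP-wrapped) model with is_model_wrapper before calling gradient_checkpointing_disable(), eval(), preparing_for_generation() and .to(dtype), since those methods live on the underlying module rather than on the wrapper. The snippet below isolates that unwrap-and-cast pattern as a sketch; `runner_model` is a placeholder argument (not an xtuner API) and the dtype map mirrors TORCH_DTYPE_MAP from the diff.

import torch
from mmengine.model import is_model_wrapper

TORCH_DTYPE_MAP = dict(
    fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')

def unwrap_and_cast(runner_model, torch_dtype='fp16'):
    # DDP-style wrappers expose the user model as `.module`
    model = runner_model.module if is_model_wrapper(runner_model) else runner_model
    dtype = TORCH_DTYPE_MAP[torch_dtype]
    if dtype != 'auto':
        model.to(dtype)
    return model

if __name__ == '__main__':
    layer = torch.nn.Linear(4, 4)  # stands in for the LLaVA model
    print(unwrap_and_cast(layer, 'fp16').weight.dtype)  # torch.float16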
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + 
tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceLLaVADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset)) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset)) +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 36053f6a91e339223594c091958aef6ce6f950ce Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 11:23:21 +0800 Subject: [PATCH 008/126] add config --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 288 ++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py new file mode 100644 index 000000000..d2c4d9c50 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -0,0 +1,288 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
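Note: in the config above, val_dataloader and test_dataloader wrap several benchmark datasets in a single ConcatDataset, and the custom loops then walk dataloader.dataset.datasets so that each sub-dataset keeps its own metainfo and evaluate(). The tiny self-contained sketch below shows that access pattern; it uses torch's ConcatDataset and a toy TinyEvalSet purely for illustration, on the assumption that xtuner's ConcatDataset exposes its members the same way as far as the loops are concerned.

from torch.utils.data import ConcatDataset, DataLoader, Dataset

class TinyEvalSet(Dataset):
    """Toy stand-in for MMELLaVADataset / MultipleChoiceLLaVADataset."""
    def __init__(self, name, n):
        self.metainfo = {'name': name}
        self._n = n

    def __len__(self):
        return self._n

    def __getitem__(self, idx):
        return {'id': idx, 'set': self.metainfo['name']}

    def evaluate(self, results, work_dir=None):
        return {f"{self.metainfo['name']}/num_pred": len(results)}

loader = DataLoader(
    ConcatDataset([TinyEvalSet('mme', 3), TinyEvalSet('mmbench', 2)]),
    batch_size=1, num_workers=0)

# same access pattern as ValLoop/TestLoop: iterate sub-datasets, not the loader
for dataset in loader.dataset.datasets:
    results = [dataset[i] for i in range(len(dataset))]
    print(dataset.metainfo, dataset.evaluate(results, work_dir='.'))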
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) +from peft import LoraConfig +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +import torch +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + llm_lora=dict( + type=LoraConfig, + r=512, + lora_alpha=256, + lora_dropout=0.05, + bias='none', + task_type='CAUSAL_LM'), + visual_encoder=dict( + 
type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceLLaVADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset)) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset)) +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 211c33ab66e00b1ab6371a3c7a56e1ab4749283a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 11:45:53 +0800 Subject: [PATCH 009/126] add config --- ...ip_so400m_p14_384_loar_e1_gpu8_finetune.py | 291 ++++++++++++++++++ ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 271 ++++++++++++++++ 2 files changed, 562 insertions(+) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py new file mode 100644 index 000000000..9a5428318 --- /dev/null +++ 
b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py @@ -0,0 +1,291 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) +from peft import LoraConfig +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +import torch +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + 
bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + llm_lora=dict( + type=LoraConfig, + r=512, + lora_alpha=256, + lora_dropout=0.05, + bias='none', + task_type='CAUSAL_LM'), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceLLaVADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset)) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset)) +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py new file mode 100644 index 000000000..a404c12a0 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
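Note: the QLoRA configs above express 4-bit quantization and LoRA purely as config dicts. For readers less used to the dict-style configs, a sketch of the imperative equivalent with the same hyper-parameters follows. It only constructs the config objects; actually loading microsoft/phi-2 in 4-bit needs a CUDA device with bitsandbytes installed, so the from_pretrained call is left commented out.

import torch
from peft import LoraConfig
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4')

llm_lora = LoraConfig(
    r=512, lora_alpha=256, lora_dropout=0.05,
    bias='none', task_type='CAUSAL_LM')

# from transformers import AutoModelForCausalLM
# llm = AutoModelForCausalLM.from_pretrained(
#     'microsoft/phi-2', trust_remote_code=True,
#     torch_dtype=torch.float16, quantization_config=quantization_config)

print(quantization_config.bnb_4bit_quant_type, llm_lora.r)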
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + 
tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceLLaVADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MMELLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset)) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset)) +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From b38f453153f11a1d29c4fb37b1e4239e32d0c5ea Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 12:12:29 +0800 Subject: [PATCH 010/126] fix disp --- xtuner/engine/runner/loops.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 6679b4f91..7f429f137 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -75,7 +75,8 @@ def run(self) -> dict: rank = get_rank() metrics = [] - + # Ensure that eta and log are displayed correctly. 
+ current_run_total_ids = 0 for _, dataset in enumerate(self.dataloader.dataset.datasets): model.preparing_for_generation(dataset.metainfo) @@ -86,7 +87,8 @@ def run(self) -> dict: min(n_samples, per_rank_samples * (rank + 1))) for idx in per_rank_ids: data_batch = dataset[idx] - self.run_iter(idx, data_batch, results) + self.run_iter(current_run_total_ids, data_batch, results) + current_run_total_ids += 1 barrier() self.runner.logger.info('==================== Start collect results ===================') @@ -163,6 +165,8 @@ def run(self) -> dict: rank = get_rank() metrics = [] + # Ensure that eta and log are displayed correctly. + current_run_total_ids = 0 for _, dataset in enumerate(self.dataloader.dataset.datasets): model.preparing_for_generation(dataset.metainfo) @@ -173,7 +177,8 @@ def run(self) -> dict: min(n_samples, per_rank_samples * (rank + 1))) for idx in per_rank_ids: data_batch = dataset[idx] - self.run_iter(idx, data_batch, results) + self.run_iter(current_run_total_ids, data_batch, results) + current_run_total_ids += 1 barrier() self.runner.logger.info('==================== Start collect results ===================') From a86815119bce49ee9de7edb1b212febbda075e14 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 13:06:28 +0800 Subject: [PATCH 011/126] fix test --- xtuner/engine/runner/loops.py | 2 +- xtuner/tools/test.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 7f429f137..af607ad9d 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -50,7 +50,7 @@ def __init__(self, class ValLoop(MMENGINE_ValLoop): - def __init__(self, runner, dataloader=None, evaluator=None, torch_dtype=None, select_metric='first') -> None: + def __init__(self, runner, dataloader=None, evaluator=None, torch_dtype='fp16', select_metric='first') -> None: # must be concatset super(MMENGINE_ValLoop, self).__init__(runner, dataloader) self._runner = runner diff --git a/xtuner/tools/test.py b/xtuner/tools/test.py index 5eb3f6d9d..6d5d3820c 100644 --- a/xtuner/tools/test.py +++ b/xtuner/tools/test.py @@ -11,6 +11,7 @@ from xtuner.configs import cfgs_name_path from xtuner.model.utils import guess_load_checkpoint from xtuner.registry import MAP_FUNC +from mmengine.model import is_model_wrapper def parse_args(): @@ -96,7 +97,11 @@ def main(): runner = RUNNERS.build(cfg) state_dict = guess_load_checkpoint(args.checkpoint) - runner.model.load_state_dict(state_dict, strict=False) + + if is_model_wrapper(runner.model): + runner.model.module.load_state_dict(state_dict, strict=False) + else: + runner.model.load_state_dict(state_dict, strict=False) runner.logger.info(f'Load checkpoint from {args.checkpoint}') # start testing From 7f70c56d83c6193f54fa8e066021152e1a08070e Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 19:17:34 +0800 Subject: [PATCH 012/126] add dataset --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 78 +++++- xtuner/dataset/evaluation/__init__.py | 5 +- .../evaluation/hallusion_llava_dataset.py | 157 +++++++++++ .../dataset/evaluation/pope_llava_dataset.py | 190 +++++++++++++ .../evaluation/textvqa_llava_dataset.py | 137 ++++++++++ xtuner/dataset/evaluation/textvqa_utils.py | 255 ++++++++++++++++++ xtuner/dataset/evaluation/utils.py | 60 +++++ 7 files changed, 878 insertions(+), 4 deletions(-) create mode 100644 xtuner/dataset/evaluation/hallusion_llava_dataset.py create mode 100644 xtuner/dataset/evaluation/pope_llava_dataset.py create mode 100644 
xtuner/dataset/evaluation/textvqa_llava_dataset.py create mode 100644 xtuner/dataset/evaluation/textvqa_utils.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index 60e0f8bb7..27fbef3f8 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -13,7 +13,8 @@ from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset, POPELLaVADataset, \ + HallusionLLaVADataset, TextVQALLaVADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -232,6 +233,64 @@ ] test_dataset = [ + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQALLaVADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), dict( type=MMELLaVADataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', @@ -239,10 +298,23 @@ prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, 
image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + type=HallusionLLaVADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPELLaVADataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 0c995e54d..1353a06be 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -1,4 +1,7 @@ from .mme_llava_dataset import MMELLaVADataset from .multiple_choice_llava_dataset import MultipleChoiceLLaVADataset +from .pope_llava_dataset import POPELLaVADataset +from .hallusion_llava_dataset import HallusionLLaVADataset +from .textvqa_llava_dataset import TextVQALLaVADataset -__all__ = ['MMELLaVADataset', 'MultipleChoiceLLaVADataset'] +__all__ = ['MMELLaVADataset', 'MultipleChoiceLLaVADataset', 'POPELLaVADataset', 'HallusionLLaVADataset', 'TextVQALLaVADataset'] diff --git a/xtuner/dataset/evaluation/hallusion_llava_dataset.py b/xtuner/dataset/evaluation/hallusion_llava_dataset.py new file mode 100644 index 000000000..385619ee5 --- /dev/null +++ b/xtuner/dataset/evaluation/hallusion_llava_dataset.py @@ -0,0 +1,157 @@ +import os +import os.path as osp + +import pandas as pd +import torch +from mmengine.dist import (master_only) +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image, expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from .utils import YOrN_Extraction, Hallusion_rating + + +class HallusionLLaVADataset(BaseEvalDataset): + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + 
image_path = self.df.iloc[idx]['image_path'] + question = self.df.iloc[idx]['question'] + category = self.df.iloc[idx]['category'] + l2_category = self.df.iloc[idx]['l2-category'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'image_path': image_path, + 'question': question, + 'answer': answer, + 'category': category, + 'index': index, + 'l2-category': l2_category, + 'id': idx + } + data_list.append(data) + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = {'id': data['id']} + + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.use_system: + inputs = self.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.template['INSTRUCTION'].format(input=text, round=1) + + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + image = self.get_image(data['img']).convert('RGB') + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + + @master_only + def evaluate(self, result, work_dir): + orig_index = [x['id'] for x in self.data] + results = [] + for pred_dict in result: + index = pred_dict['id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + cur_result['image_path'] = filtered_rows.get('image_path') + cur_result['l2-category'] = filtered_rows.get('l2-category') + results.append(cur_result) + + results_df = pd.DataFrame(results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + data = results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + # 不使用 gpt + data['extracted'] = [ans_map[x] for x in data['index']] + data['score'] = (data['answer'] == data['extracted']) + + results_df = pd.DataFrame(data) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = Hallusion_rating(data) + print_log('============================================', 'current') + print_log(score, 'current') + print_log('============================================', 'current') + print_log(f'YOrN_eval successfully finished evaluating', 'current') + return score + diff --git a/xtuner/dataset/evaluation/pope_llava_dataset.py b/xtuner/dataset/evaluation/pope_llava_dataset.py new file mode 100644 index 000000000..17dd7810e --- /dev/null +++ 
b/xtuner/dataset/evaluation/pope_llava_dataset.py @@ -0,0 +1,190 @@ +import json +import os + +import pandas as pd +import torch +from mmengine.dist import master_only +from PIL import Image + +from xtuner.dataset.utils import expand2square +from xtuner.registry import BUILDER +from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX +from mmengine.logging import print_log +from .base_eval_dataset import BaseEvalDataset + +from .utils import YOrN_Extraction, load_jsonl + + +def eval_func(pred_list, label_list): + pos = 1 + neg = 0 + yes_ratio = pred_list.count(1) / len(pred_list) + + TP, TN, FP, FN = 0, 0, 0, 0 + for pred, label in zip(pred_list, label_list): + if pred == pos and label == pos: + TP += 1 + elif pred == pos and label == neg: + FP += 1 + elif pred == neg and label == neg: + TN += 1 + elif pred == neg and label == pos: + FN += 1 + + print_log('TP\tFP\tTN\tFN\t', 'current') + print_log(f'{TP}\t{FP}\t{TN}\t{FN}', 'current') + + precision = float(TP) / float(TP + FP) + recall = float(TP) / float(TP + FN) + f1 = 2 * precision * recall / (precision + recall) + acc = (TP + TN) / (TP + TN + FP + FN) + print_log(f'Accuracy: {acc}', 'current') + print_log(f'Precision: {precision}', 'current') + print_log(f'Recall: {recall}', 'current') + print_log(f'F1 score: {f1}', 'current') + print_log(f'Yes ratio: {yes_ratio}', 'current') + return f1 + + +class POPELLaVADataset(BaseEvalDataset): + + def __init__(self, data_file, coco_val_path, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None): + super().__init__(metainfo) + self.use_system = use_system + if isinstance(data_file, str): + data_file = [data_file] + self.raw_data = [load_jsonl(f) for f in data_file] + + self.name = [ + os.path.splitext(os.path.basename(f))[0] for f in data_file + ] + + self.coco_val_path = coco_val_path + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + + self.results_xlsx_path = 'pope-results.xlsx' + self.data = self.load_data_list() + + def get_image(self, image): + image = Image.open(os.path.join(self.coco_val_path, image)) + return image + + def __len__(self): + return len(self.data) + + def load_data_list(self): + data_list = [] + idx = 0 + for data_idx in range(len(self.raw_data)): + for sample_idx in range(len(self.raw_data[data_idx])): + sample = self.raw_data[data_idx][sample_idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + answer = sample['label'] + category = self.name[data_idx] + assert answer in ['yes', 'no'] + data = { + 'id': idx, + 'index': index, + 'img': image_path, + 'question': question, + 'answer': answer, + 'category': category + } + data_list.append(data) + idx += 1 + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = {'id': data['id']} + + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + # We did not add “\nAnswer the question using a single word or phrase.” to the prompt + if self.use_system: + inputs = self.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.template['INSTRUCTION'].format(input=text, round=1) + + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode( + chunk, add_special_tokens=False) 
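
The prompt-encoding step above is shared almost verbatim by the Hallusion, POPE and TextVQA evaluation datasets added in this series (and is later factored out into LLaVAProxyEvalDataset): the templated prompt is split on the image placeholder, each chunk is encoded separately, and IMAGE_TOKEN_INDEX is spliced between the encoded chunks so that the visual features can be inserted at that position by _prepare_data_for_llm. Below is a minimal, self-contained sketch of the same splicing; the whitespace "tokenizer" and its ids are stand-ins for illustration only and are not part of the patch.

IMAGE_TOKEN_INDEX = -200          # sentinel id xtuner reserves for the image slot
DEFAULT_IMAGE_TOKEN = '<image>'


def toy_encode(text, add_special_tokens=True):
    # Stand-in for tokenizer.encode: one deterministic id per whitespace token,
    # plus a fake BOS id (1) when special tokens are requested.
    ids = [sum(ord(c) for c in tok) % 1000 for tok in text.split()]
    return ([1] + ids) if add_special_tokens else ids


def build_input_ids(prompt):
    # Exactly one image placeholder is expected, mirroring the assert in the
    # datasets above.
    chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
    assert len(chunks) == 2
    first = toy_encode(chunks[0])                          # keeps the BOS token
    second = toy_encode(chunks[1], add_special_tokens=False)
    return first + [IMAGE_TOKEN_INDEX] + second


print(build_input_ids(DEFAULT_IMAGE_TOKEN + '\nIs there a dog in the image?'))
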
+ chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + image = self.get_image(data['img']).convert('RGB') + if self.pad_image_to_square: + image = expand2square( + image, + tuple(int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + + @master_only + def evaluate(self, result, work_dir, show=True): + orig_index = [x['id'] for x in self.data] + results = [] + for pred_dict in result: + index = pred_dict['id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + results.append(cur_result) + + results_df = pd.DataFrame(results) + with pd.ExcelWriter( + os.path.join(work_dir, self.results_xlsx_path), + engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = 0 + for sub_name in self.name: + sub_results = [x for x in results if x['category'] == sub_name] + pred_list = [ + int(YOrN_Extraction(x['prediction']) == 'Yes') + for x in sub_results + ] + label_list = [ + int(YOrN_Extraction(x['answer']) == 'Yes') for x in sub_results + ] + print_log('============================================', 'current') + print_log('Category: {}, # samples: {}'.format(sub_name, + len(sub_results)), 'current') + cur_f1 = eval_func(pred_list, label_list) + score += cur_f1 + + score /= len(self.name) + print_log('============================================', 'current') + print_log(f'Average F1-score: {score}', 'current') + print_log('============================================', 'current') + print_log('POPE successfully finished evaluating', 'current') + return score diff --git a/xtuner/dataset/evaluation/textvqa_llava_dataset.py b/xtuner/dataset/evaluation/textvqa_llava_dataset.py new file mode 100644 index 000000000..dedec318e --- /dev/null +++ b/xtuner/dataset/evaluation/textvqa_llava_dataset.py @@ -0,0 +1,137 @@ +import os +import os.path as osp +import re + +import torch +from .base_eval_dataset import BaseEvalDataset + +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.registry import BUILDER +import json +from PIL import Image +from xtuner.dataset.utils import expand2square +from mmengine.dist import (master_only) +from .textvqa_utils import TextVQAAccuracyEvaluator +from mmengine.logging import print_log + + +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) 
Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: + if prompt.startswith('Reference OCR token:'): + question = prompt.split('\n')[1] + else: + question = prompt.split('\n')[0] + elif len(prompt.split('\n')) == 2: + question = prompt.split('\n')[0] + else: + assert False + + return question.lower() + + +class TextVQALLaVADataset(BaseEvalDataset): + def __init__(self, data_file, ann_file, image_folder, prompt_template, image_processor, tokenizer, + pad_image_to_square=True, use_system=False, metainfo=None): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.ann_file = ann_file + self.image_folder = image_folder + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.jsonl' + self.data = self.load_data_list() + + def load_data_list(self): + return [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data_dict = {'id': idx} + line = self.data[idx] + image_file = line["image"] + qs = line["text"] + + text = DEFAULT_IMAGE_TOKEN + '\n' + qs + + if self.use_system: + inputs = self.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.template['INSTRUCTION'].format(input=text, round=1) + + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + + @master_only + def evaluate(self, result, work_dir, show=True): + answers_file = osp.join(work_dir, self.results_path) + ans_file = open(answers_file, "w") + + for pred_dict in result: + idx = pred_dict["id"] + gt_data = self.data[idx] + + ans_file.write(json.dumps({"question_id": gt_data['question_id'], + "prompt": gt_data['text'], + "text": pred_dict['prediction'], + "metadata": {}}) + "\n") + ans_file.close() + + annotations = json.load(open(self.ann_file))['data'] + annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in + annotations} + results = [json.loads(line) for line in open(answers_file)] + + pred_list = [] + for result in results: + annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] + pred_list.append({ + "pred_answer": result['text'], + "gt_answers": annotation['answers'], + }) + + evaluator = TextVQAAccuracyEvaluator() + acc = 100. 
* evaluator.eval_pred_list(pred_list) + print_log('============================================', 'current') + print_log('Samples: {}, Accuracy: {:.2f}%'.format(len(pred_list), acc), 'current') + print_log('============================================', 'current') + print_log(f'TextVQA successfully finished evaluating', 'current') + return {'acc': acc} diff --git a/xtuner/dataset/evaluation/textvqa_utils.py b/xtuner/dataset/evaluation/textvqa_utils.py new file mode 100644 index 000000000..c3e5887e0 --- /dev/null +++ b/xtuner/dataset/evaluation/textvqa_utils.py @@ -0,0 +1,255 @@ +from tqdm import tqdm +import re + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + 
"yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or ( + re.search(self.COMMA_STRIP, in_text) is not None + ): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item + + +class TextVQAAccuracyEvaluator: + def __init__(self): + self.answer_processor = EvalAIAnswerProcessor() + + def _compute_answer_scores(self, raw_answers): + """ + compute the accuracy (soft score) of human answers + """ + answers = [self.answer_processor(a) for a in raw_answers] + assert len(answers) == 10 + gt_answers = list(enumerate(answers)) + unique_answers = set(answers) + unique_answer_scores = {} + + for unique_answer in unique_answers: + accs = [] + for gt_answer in gt_answers: + other_answers = [item for item in gt_answers if item != gt_answer] + matching_answers = [ + item for item in other_answers if item[1] == unique_answer + ] + acc = min(1, float(len(matching_answers)) / 3) + accs.append(acc) + unique_answer_scores[unique_answer] = sum(accs) / len(accs) + + return unique_answer_scores + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in tqdm(pred_list): + pred_answer = self.answer_processor(entry["pred_answer"]) + unique_answer_scores = self._compute_answer_scores(entry["gt_answers"]) + score = unique_answer_scores.get(pred_answer, 0.0) + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy diff --git a/xtuner/dataset/evaluation/utils.py b/xtuner/dataset/evaluation/utils.py index e01179aec..4b02eda82 100644 --- a/xtuner/dataset/evaluation/utils.py +++ b/xtuner/dataset/evaluation/utils.py @@ -1,5 +1,6 @@ import numpy as np from collections import defaultdict +import json def process_punctuation(inText): @@ -73,3 +74,62 @@ def acc(key, mode='normal'): ret[sc] = base ret.update(scores) return ret + + +def 
Hallusion_rating(data): + def calc_fAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['figure_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_qAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['question_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_aAcc(data): + return np.mean(data['score']) * 100 + + data['set_id'] = [x.split('_')[3] for x in data['index']] + data['figure_id'] = [x.split('_')[4] for x in data['index']] + data['question_id'] = [x.split('_')[5] for x in data['index']] + + res = dict(split=[], aAcc=[], fAcc=[], qAcc=[]) + res['split'].append('Overall') + res['aAcc'].append(calc_aAcc(data)) + res['fAcc'].append(calc_fAcc(data)) + res['qAcc'].append(calc_qAcc(data)) + + if 'category' in data: + cates = list(set(data['category'])) + for c in cates: + sub = data[data['category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + + if 'l2-category' in data: + cates = list(set(data['l2-category'])) + for c in cates: + sub = data[data['l2-category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + return res + + +def load_jsonl(json_file): + with open(json_file) as f: + lines = f.readlines() + data = [] + for line in lines: + data.append(json.loads(line)) + return data From 04f0ac151a6442c1977964531bed469e1426f2ec Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 1 Apr 2024 19:59:35 +0800 Subject: [PATCH 013/126] fix eval dataset --- xtuner/dataset/evaluation/hallusion_llava_dataset.py | 8 ++++---- xtuner/dataset/evaluation/pope_llava_dataset.py | 8 ++++---- xtuner/dataset/evaluation/textvqa_llava_dataset.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/xtuner/dataset/evaluation/hallusion_llava_dataset.py b/xtuner/dataset/evaluation/hallusion_llava_dataset.py index 385619ee5..cc2a2b181 100644 --- a/xtuner/dataset/evaluation/hallusion_llava_dataset.py +++ b/xtuner/dataset/evaluation/hallusion_llava_dataset.py @@ -67,14 +67,14 @@ def load_data_list(self): 'category': category, 'index': index, 'l2-category': l2_category, - 'id': idx + 'img_id': idx } data_list.append(data) return data_list def __getitem__(self, idx): data = self.data[idx] - data_dict = {'id': data['id']} + data_dict = {'img_id': data['img_id']} text = data['question'] text = DEFAULT_IMAGE_TOKEN + '\n' + text @@ -115,10 +115,10 @@ def __getitem__(self, idx): @master_only def evaluate(self, result, work_dir): - orig_index = [x['id'] for x in self.data] + orig_index = [x['img_id'] for x in self.data] results = [] for pred_dict in result: - index = pred_dict['id'] + index = pred_dict['img_id'] new_index = orig_index.index(index) filtered_rows = self.data[new_index] diff --git a/xtuner/dataset/evaluation/pope_llava_dataset.py b/xtuner/dataset/evaluation/pope_llava_dataset.py index 17dd7810e..9fdca6f2d 100644 --- a/xtuner/dataset/evaluation/pope_llava_dataset.py +++ b/xtuner/dataset/evaluation/pope_llava_dataset.py @@ -92,7 +92,7 @@ def load_data_list(self): category = self.name[data_idx] assert answer in ['yes', 'no'] data = { - 'id': idx, + 'img_id': idx, 'index': index, 'img': image_path, 'question': 
question, @@ -105,7 +105,7 @@ def load_data_list(self): def __getitem__(self, idx): data = self.data[idx] - data_dict = {'id': data['id']} + data_dict = {'img_id': data['img_id']} text = data['question'] text = DEFAULT_IMAGE_TOKEN + '\n' + text @@ -146,10 +146,10 @@ def __getitem__(self, idx): @master_only def evaluate(self, result, work_dir, show=True): - orig_index = [x['id'] for x in self.data] + orig_index = [x['img_id'] for x in self.data] results = [] for pred_dict in result: - index = pred_dict['id'] + index = pred_dict['img_id'] new_index = orig_index.index(index) filtered_rows = self.data[new_index] cur_result = {} diff --git a/xtuner/dataset/evaluation/textvqa_llava_dataset.py b/xtuner/dataset/evaluation/textvqa_llava_dataset.py index dedec318e..133d00de6 100644 --- a/xtuner/dataset/evaluation/textvqa_llava_dataset.py +++ b/xtuner/dataset/evaluation/textvqa_llava_dataset.py @@ -59,7 +59,7 @@ def __len__(self): return len(self.data) def __getitem__(self, idx): - data_dict = {'id': idx} + data_dict = {'img_id': idx} line = self.data[idx] image_file = line["image"] qs = line["text"] @@ -106,7 +106,7 @@ def evaluate(self, result, work_dir, show=True): ans_file = open(answers_file, "w") for pred_dict in result: - idx = pred_dict["id"] + idx = pred_dict["img_id"] gt_data = self.data[idx] ans_file.write(json.dumps({"question_id": gt_data['question_id'], From 8b44a9e0aa6a4c3a1c1b5672e4665a137d1fdc00 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 3 Apr 2024 15:06:33 +0800 Subject: [PATCH 014/126] update config --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 7 +++-- ...ip_so400m_p14_384_loar_e1_gpu8_finetune.py | 7 +++-- ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 7 +++-- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 7 +++-- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 7 +++-- .../dataset/collate_fns/defalut_collate_fn.py | 3 ++ xtuner/engine/runner/loops.py | 29 +++++++++++-------- xtuner/model/llava.py | 10 ++----- 8 files changed, 47 insertions(+), 30 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py index d2c4d9c50..9d4ffcd7f 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -273,7 +273,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset)) + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=default_collate_fn)) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -283,6 +284,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset)) + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=default_collate_fn) +) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py index 9a5428318..6cb39ac6f 100644 --- 
a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py @@ -276,7 +276,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset)) + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=default_collate_fn)) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -286,6 +287,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset)) + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=default_collate_fn) +) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index a404c12a0..cec2bb2a2 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -256,7 +256,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset)) + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=default_collate_fn)) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -266,6 +267,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset)) + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=default_collate_fn) +) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index 27fbef3f8..2e054e04b 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -328,7 +328,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset)) + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=default_collate_fn)) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -338,6 +339,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset)) + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=default_collate_fn) +) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index 8547e451c..a08e1a027 100644 --- 
a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -247,7 +247,8 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset)) + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=default_collate_fn)) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -257,7 +258,9 @@ num_workers=0, drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset)) + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=default_collate_fn) +) test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/collate_fns/defalut_collate_fn.py b/xtuner/dataset/collate_fns/defalut_collate_fn.py index f644df9cf..9780c7d9f 100644 --- a/xtuner/dataset/collate_fns/defalut_collate_fn.py +++ b/xtuner/dataset/collate_fns/defalut_collate_fn.py @@ -87,6 +87,9 @@ def default_collate_fn(instances: Sequence[Dict], pixel_values = torch.stack(pixel_values) data_dict['pixel_values'] = pixel_values + # add img_id for eval if exists + img_ids = [example.get('img_id', 0) for example in instances] + data_dict['img_id'] = img_ids if return_hf_format: return data_dict else: diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index af607ad9d..3256d232e 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -50,9 +50,10 @@ def __init__(self, class ValLoop(MMENGINE_ValLoop): - def __init__(self, runner, dataloader=None, evaluator=None, torch_dtype='fp16', select_metric='first') -> None: + def __init__(self, runner, dataloader, evaluator=None, torch_dtype='fp16', select_metric='first') -> None: # must be concatset super(MMENGINE_ValLoop, self).__init__(runner, dataloader) + self.collate_fn = self.dataloader.collate_fn self._runner = runner self.torch_dtype = torch_dtype if torch_dtype is not None: @@ -87,6 +88,8 @@ def run(self) -> dict: min(n_samples, per_rank_samples * (rank + 1))) for idx in per_rank_ids: data_batch = dataset[idx] + # TODO: Only bs=1 is currently supported temporarily + data_batch = self.collate_fn([data_batch]) self.run_iter(current_run_total_ids, data_batch, results) current_run_total_ids += 1 @@ -124,17 +127,17 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): data_batch (Sequence[dict]): Batch of data from dataloader. """ - assert 'img_id' in data_batch, 'img_id is required in data_batch. ' \ - 'The __getitem__ function in the dataset must ' \ - 'return a dictionary with the img_id.' - prediction = {'img_id': data_batch['img_id']} + assert 'img_id' in data_batch['data'], 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' 
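
Taken together, the loop changes in this commit define a small contract for evaluation datasets: __getitem__ returns a dict that carries img_id (alongside input_ids and pixel_values), samples are collated one at a time for now (only batch size 1 is supported), the val_step/test_step output is merged into a per-sample {'img_id', 'prediction'} record, and the collected records are finally handed to the dataset's evaluate(results, work_dir). The sketch below shows a minimal dataset satisfying that contract; the class name and its hard-coded yes/no data are illustrative assumptions, not part of xtuner.

class DummyYesNoEvalDataset:
    """Illustrative only: the smallest dataset the new ValLoop/TestLoop can drive."""

    def __init__(self):
        # 'img_id' must uniquely identify each sample so that predictions can
        # be matched back to ground truth once results from all ranks are
        # combined.
        self.data = [
            {'img_id': 0, 'question': 'Is there a cat?', 'answer': 'yes'},
            {'img_id': 1, 'question': 'Is there a dog?', 'answer': 'no'},
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # A real dataset also returns 'input_ids' and 'pixel_values' here,
        # exactly like the MME/POPE/TextVQA datasets in this series.
        return {'img_id': self.data[idx]['img_id']}

    def evaluate(self, results, work_dir):
        # `results` is the list of {'img_id': ..., 'prediction': ...} dicts
        # collected by the loops above.
        gt = {d['img_id']: d['answer'] for d in self.data}
        correct = sum(
            str(r['prediction']).strip().lower().startswith(gt[r['img_id']])
            for r in results)
        return {'acc': correct / max(len(results), 1)}
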
+ prediction = {'img_id': data_batch['data']['img_id'][0]} self.runner.call_hook( 'before_val_iter', batch_idx=idx, data_batch=data_batch) # outputs should be sequence of BaseDataElement - outputs = self.runner.model.val_step({'data': data_batch}) - prediction['prediction'] = outputs['prediction'] + outputs = self.runner.model.val_step(data_batch) + prediction.update(outputs) results.append(prediction) self.runner.call_hook( @@ -177,6 +180,8 @@ def run(self) -> dict: min(n_samples, per_rank_samples * (rank + 1))) for idx in per_rank_ids: data_batch = dataset[idx] + # TODO: Only bs=1 is currently supported temporarily + data_batch = self.collate_fn([data_batch]) self.run_iter(current_run_total_ids, data_batch, results) current_run_total_ids += 1 @@ -214,16 +219,16 @@ def run_iter(self, idx, data_batch: Sequence[dict], results: list): data_batch (Sequence[dict]): Batch of data from dataloader. """ - assert 'img_id' in data_batch, 'img_id is required in data_batch. ' \ - 'The __getitem__ function in the dataset must ' \ - 'return a dictionary with the img_id.' - prediction = {'img_id': data_batch['img_id']} + assert 'img_id' in data_batch['data'], 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' + prediction = {'img_id': data_batch['data']['img_id'][0]} self.runner.call_hook( 'before_test_iter', batch_idx=idx, data_batch=data_batch) # outputs should be sequence of BaseDataElement - outputs = self.runner.model.val_step({'data': data_batch}) + outputs = self.runner.model.val_step(data_batch) prediction.update(outputs) results.append(prediction) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index c7c1c1b15..2fff52d03 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -265,6 +265,7 @@ def forward(self, data, data_samples=None, mode='loss'): data = self._prepare_data_for_llm(data) return self.compute_loss(data, data_samples) elif mode == 'predict' or mode == 'generate': + data = self._prepare_data_for_llm(data) return self.generate(data, data_samples) elif mode == 'chat': return self.chat(data) @@ -294,15 +295,8 @@ def preparing_for_generation(self, metainfo: dict = None): self.stop_criteria = stop_criteria def generate(self, data, data_samples=None): - # TODO: It is the direct output of the dataset without going through the dataloader. 
- input_ids = data['input_ids'].unsqueeze(0).to(self.visual_encoder.device) - data['input_ids'] = input_ids - pixel_values = data['pixel_values'].unsqueeze(0).to(self.visual_encoder.device) - data['pixel_values'] = pixel_values - - mm_inputs = self._prepare_data_for_llm(data) generate_output = self.llm.generate( - **mm_inputs, + **data, generation_config=self.gen_config, streamer=None, bos_token_id=self.tokenizer.bos_token_id, From 05534c9c2d3076f28e5a60091902782d2086d3e9 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 3 Apr 2024 16:06:41 +0800 Subject: [PATCH 015/126] fix --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 7 +- ...ip_so400m_p14_384_loar_e1_gpu8_finetune.py | 7 +- ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 9 ++- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 9 ++- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 62 --------------- xtuner/dataset/collate_fns/__init__.py | 3 +- .../dataset/collate_fns/defalut_collate_fn.py | 3 - xtuner/dataset/collate_fns/mm_collate_fn.py | 77 +++++++++++++++++++ 8 files changed, 98 insertions(+), 79 deletions(-) create mode 100644 xtuner/dataset/collate_fns/mm_collate_fn.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py index 9d4ffcd7f..d1f33caf0 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -7,7 +7,7 @@ SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) from peft import LoraConfig from xtuner.dataset import LLaVADataset -from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook @@ -274,7 +274,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -285,7 +285,8 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=test_dataset), - collate_fn=dict(type=default_collate_fn) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) ) + test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py index 6cb39ac6f..bfc091ab0 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py @@ -7,7 +7,7 @@ SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) from peft import LoraConfig from xtuner.dataset import LLaVADataset -from xtuner.dataset.collate_fns import default_collate_fn +from 
xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook @@ -277,7 +277,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -288,7 +288,8 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=test_dataset), - collate_fn=dict(type=default_collate_fn) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) ) + test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index cec2bb2a2..2eedceca0 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -7,7 +7,7 @@ SiglipImageProcessor, SiglipVisionModel) from xtuner.dataset import LLaVADataset -from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook @@ -249,6 +249,8 @@ pad_image_to_square=True) ] +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 # TODO: We are not currently using val_evaluator # Don't support num_workers > 0 val_dataloader = dict( @@ -257,7 +259,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -268,7 +270,8 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=test_dataset), - collate_fn=dict(type=default_collate_fn) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) ) + test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index 2e054e04b..57f58b675 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -7,7 +7,7 @@ SiglipImageProcessor, SiglipVisionModel) from xtuner.dataset import LLaVADataset -from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import 
LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook @@ -112,7 +112,7 @@ type=LengthGroupedSampler, length_property='modality_length', per_device_batch_size=batch_size * accumulative_counts), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn)) ####################################################################### # PART 4 Scheduler & Optimizer # @@ -329,7 +329,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -340,7 +340,8 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=test_dataset), - collate_fn=dict(type=default_collate_fn) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) ) + test_evaluator = val_evaluator test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index a08e1a027..2f56d0637 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -202,65 +202,3 @@ # set log processor log_processor = dict(by_epoch=False) - -# ==================== val and test cfg ======================= -val_dataset = [ - dict( - type=MMELLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', - image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), - # dict( - # type=MultipleChoiceLLaVADataset, - # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - # prompt_template=PROMPT_TEMPLATE.vicuna, - # tokenizer=tokenizer, - # image_processor=image_processor, - # pad_image_to_square=True) -] - -test_dataset = [ - dict( - type=MMELLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', - image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), - dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True) -] - -# TODO: We are not currently using val_evaluator -# Don't support num_workers > 0 -val_dataloader = dict( - batch_size=1, - num_workers=0, - drop_last=False, - sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=default_collate_fn)) -val_evaluator = dict() -val_cfg = dict(type=ValLoop) - -# TODO: We are not currently using test_evaluator -test_dataloader = dict( - batch_size=1, - num_workers=0, - drop_last=False, - sampler=dict(type=DefaultSampler, shuffle=False), - dataset=dict(type=ConcatDataset, datasets=test_dataset), - 
collate_fn=dict(type=default_collate_fn) -) -test_evaluator = val_evaluator -test_cfg = dict(type=TestLoop, select_metric='first') - diff --git a/xtuner/dataset/collate_fns/__init__.py b/xtuner/dataset/collate_fns/__init__.py index 0d2d1febe..6f6fb4631 100644 --- a/xtuner/dataset/collate_fns/__init__.py +++ b/xtuner/dataset/collate_fns/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .defalut_collate_fn import default_collate_fn from .mmlu_collate_fn import mmlu_collate_fn +from .mm_collate_fn import mm_collate_fn -__all__ = ['default_collate_fn', 'mmlu_collate_fn'] +__all__ = ['default_collate_fn', 'mmlu_collate_fn', 'mm_collate_fn'] diff --git a/xtuner/dataset/collate_fns/defalut_collate_fn.py b/xtuner/dataset/collate_fns/defalut_collate_fn.py index 9780c7d9f..f644df9cf 100644 --- a/xtuner/dataset/collate_fns/defalut_collate_fn.py +++ b/xtuner/dataset/collate_fns/defalut_collate_fn.py @@ -87,9 +87,6 @@ def default_collate_fn(instances: Sequence[Dict], pixel_values = torch.stack(pixel_values) data_dict['pixel_values'] = pixel_values - # add img_id for eval if exists - img_ids = [example.get('img_id', 0) for example in instances] - data_dict['img_id'] = img_ids if return_hf_format: return data_dict else: diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py new file mode 100644 index 000000000..bd9469092 --- /dev/null +++ b/xtuner/dataset/collate_fns/mm_collate_fn.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Sequence + +import torch +from torch.nn.utils.rnn import pad_sequence +from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX + + +def mm_collate_fn(instances: Sequence[Dict], + pad_index: int = DEFAULT_PAD_TOKEN_INDEX, + return_hf_format: bool = False, + extra_collate_keys=None): + input_ids = [] + labels = [] + cumulative_len = [] + position_ids = [] + + has_image = any(inst.get('pixel_values') is not None for inst in instances) + has_labels = any(inst.get('labels') is not None for inst in instances) + mode = 'train' if has_labels else 'eval' + + if has_image: + pixel_values = [] + + for i, data in enumerate(instances): + input_ids.append(torch.LongTensor(data['input_ids'])) + if mode == 'train': + labels.append(torch.LongTensor(data['labels'])) + position_ids.append(torch.IntTensor(data['position_ids'])) + + if 'cumulative_len' in data: + cumulative_len.append(torch.IntTensor(data['cumulative_len'])) + + if has_image: + pixel_values.append(data['pixel_values']) + + if len(instances) > 1: + input_ids = pad_sequence( + input_ids, batch_first=True, padding_value=pad_index) + labels = pad_sequence( + labels, batch_first=True, padding_value=IGNORE_INDEX) + position_ids = pad_sequence( + position_ids, batch_first=True, padding_value=0) + else: + input_ids = torch.stack(input_ids) + if mode == 'train': + labels = torch.stack(labels) + position_ids = torch.stack(position_ids) + + if len(cumulative_len) == 0: + cumulative_len = None + + if mode == 'train': + data_dict = { + 'input_ids': input_ids, + 'position_ids': position_ids, + 'attention_mask': input_ids.ne(pad_index), + 'labels': labels, + 'cumulative_len': cumulative_len, + } + else: + data_dict = { + 'input_ids': input_ids, + } + + if has_image: + pixel_values = torch.stack(pixel_values) + data_dict['pixel_values'] = pixel_values + + if extra_collate_keys is not None: + for key in extra_collate_keys: + data_dict[key] = [inst[key] for inst in instances] + + if return_hf_format: + return data_dict + 
else: + return {'data': data_dict, 'data_samples': None} From 262b6361aa95c24c9f259ad84e7c90224dfb2d89 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 3 Apr 2024 17:59:21 +0800 Subject: [PATCH 016/126] fix --- xtuner/dataset/collate_fns/mm_collate_fn.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py index bd9469092..464954436 100644 --- a/xtuner/dataset/collate_fns/mm_collate_fn.py +++ b/xtuner/dataset/collate_fns/mm_collate_fn.py @@ -26,7 +26,6 @@ def mm_collate_fn(instances: Sequence[Dict], input_ids.append(torch.LongTensor(data['input_ids'])) if mode == 'train': labels.append(torch.LongTensor(data['labels'])) - position_ids.append(torch.IntTensor(data['position_ids'])) if 'cumulative_len' in data: cumulative_len.append(torch.IntTensor(data['cumulative_len'])) @@ -39,13 +38,14 @@ def mm_collate_fn(instances: Sequence[Dict], input_ids, batch_first=True, padding_value=pad_index) labels = pad_sequence( labels, batch_first=True, padding_value=IGNORE_INDEX) - position_ids = pad_sequence( - position_ids, batch_first=True, padding_value=0) else: input_ids = torch.stack(input_ids) if mode == 'train': labels = torch.stack(labels) - position_ids = torch.stack(position_ids) + + if mode == 'train': + attention_mask = input_ids.ne(pad_index) + position_ids = attention_mask.long().cumsum(-1) - 1 if len(cumulative_len) == 0: cumulative_len = None @@ -54,7 +54,7 @@ def mm_collate_fn(instances: Sequence[Dict], data_dict = { 'input_ids': input_ids, 'position_ids': position_ids, - 'attention_mask': input_ids.ne(pad_index), + 'attention_mask': attention_mask, 'labels': labels, 'cumulative_len': cumulative_len, } From a22f033d45a672d2e855315346e2e3e16c3da93c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 7 Apr 2024 11:28:32 +0800 Subject: [PATCH 017/126] update design --- xtuner/dataset/evaluation/__init__.py | 12 +-- .../dataset/evaluation/base_eval_dataset.py | 2 +- ..._llava_dataset.py => hallusion_dataset.py} | 52 +++---------- .../{mme_llava_dataset.py => mme_dataset.py} | 57 +++----------- ..._dataset.py => multiple_choice_dataset.py} | 63 +++------------- ...{pope_llava_dataset.py => pope_dataset.py} | 51 ++----------- ...qa_llava_dataset.py => textvqa_dataset.py} | 63 +++++----------- xtuner/dataset/llava_proxy_eval_dataset.py | 75 +++++++++++++++++++ 8 files changed, 137 insertions(+), 238 deletions(-) rename xtuner/dataset/evaluation/{hallusion_llava_dataset.py => hallusion_dataset.py} (72%) rename xtuner/dataset/evaluation/{mme_llava_dataset.py => mme_dataset.py} (70%) rename xtuner/dataset/evaluation/{multiple_choice_llava_dataset.py => multiple_choice_dataset.py} (79%) rename xtuner/dataset/evaluation/{pope_llava_dataset.py => pope_dataset.py} (75%) rename xtuner/dataset/evaluation/{textvqa_llava_dataset.py => textvqa_dataset.py} (66%) create mode 100644 xtuner/dataset/llava_proxy_eval_dataset.py diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 1353a06be..369775dba 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -1,7 +1,7 @@ -from .mme_llava_dataset import MMELLaVADataset -from .multiple_choice_llava_dataset import MultipleChoiceLLaVADataset -from .pope_llava_dataset import POPELLaVADataset -from .hallusion_llava_dataset import HallusionLLaVADataset -from .textvqa_llava_dataset import TextVQALLaVADataset +from .mme_dataset import MMEDataset +from .multiple_choice_dataset 
import MultipleChoiceDataset +from .pope_dataset import POPEDataset +from .hallusion_dataset import HallusionDataset +from .textvqa_dataset import TextVQADataset -__all__ = ['MMELLaVADataset', 'MultipleChoiceLLaVADataset', 'POPELLaVADataset', 'HallusionLLaVADataset', 'TextVQALLaVADataset'] +__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset'] diff --git a/xtuner/dataset/evaluation/base_eval_dataset.py b/xtuner/dataset/evaluation/base_eval_dataset.py index dab081462..99c778245 100644 --- a/xtuner/dataset/evaluation/base_eval_dataset.py +++ b/xtuner/dataset/evaluation/base_eval_dataset.py @@ -11,7 +11,7 @@ class BaseEvalDataset(Dataset): - METAINFO: dict = dict() + METAINFO: dict = dict(name='default') def __init__(self, metainfo: Union[Mapping, Config, None] = None): self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) diff --git a/xtuner/dataset/evaluation/hallusion_llava_dataset.py b/xtuner/dataset/evaluation/hallusion_dataset.py similarity index 72% rename from xtuner/dataset/evaluation/hallusion_llava_dataset.py rename to xtuner/dataset/evaluation/hallusion_dataset.py index cc2a2b181..cd5fd2dc6 100644 --- a/xtuner/dataset/evaluation/hallusion_llava_dataset.py +++ b/xtuner/dataset/evaluation/hallusion_dataset.py @@ -2,21 +2,22 @@ import os.path as osp import pandas as pd -import torch from mmengine.dist import (master_only) from .base_eval_dataset import BaseEvalDataset -from xtuner.dataset.utils import decode_base64_to_image, expand2square -from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.dataset.utils import decode_base64_to_image from xtuner.registry import BUILDER from mmengine.logging import print_log from .utils import YOrN_Extraction, Hallusion_rating +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset -class HallusionLLaVADataset(BaseEvalDataset): +class HallusionDataset(BaseEvalDataset): + + METAINFO: dict = dict(name='hullusion') def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, - use_system=False, metainfo=None): + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): super().__init__(metainfo) self.use_system = use_system self.data_file = data_file @@ -36,6 +37,9 @@ def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_i self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' self.data = self.load_data_list() + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + def get_image(self, image): while len(image) < 16: image = self.df[self.df['index'] == int(image)]['image'].values @@ -74,43 +78,7 @@ def load_data_list(self): def __getitem__(self, idx): data = self.data[idx] - data_dict = {'img_id': data['img_id']} - - text = data['question'] - text = DEFAULT_IMAGE_TOKEN + '\n' + text - - if self.use_system: - inputs = self.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.template['INSTRUCTION'].format(input=text, round=1) - - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - 
ids.append(IMAGE_TOKEN_INDEX) - ids = torch.tensor(ids) - data_dict['input_ids'] = ids - - image = self.get_image(data['img']).convert('RGB') - if self.pad_image_to_square: - image = expand2square( - image, - tuple( - int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - + data_dict = self.proxy_eval_dataset.getitem(idx, data) return data_dict @master_only diff --git a/xtuner/dataset/evaluation/mme_llava_dataset.py b/xtuner/dataset/evaluation/mme_dataset.py similarity index 70% rename from xtuner/dataset/evaluation/mme_llava_dataset.py rename to xtuner/dataset/evaluation/mme_dataset.py index 4e9631548..94f68f9b6 100644 --- a/xtuner/dataset/evaluation/mme_llava_dataset.py +++ b/xtuner/dataset/evaluation/mme_dataset.py @@ -2,22 +2,23 @@ import os.path as osp import pandas as pd -import torch from mmengine.dist import (master_only) from .base_eval_dataset import BaseEvalDataset -from xtuner.dataset.utils import decode_base64_to_image, expand2square -from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.dataset.utils import decode_base64_to_image from xtuner.registry import BUILDER from mmengine.logging import print_log -from PIL import Image from .utils import YOrN_Extraction, MME_rating +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset -class MMELLaVADataset(BaseEvalDataset): +class MMEDataset(BaseEvalDataset): + + METAINFO: dict = dict(name='mme') def __init__(self, data_file, image_folder, prompt_template, image_processor, tokenizer, pad_image_to_square=True, - use_system=False, for_llava_prompt=False, metainfo=None): + use_system=False, for_llava_prompt=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): super().__init__(metainfo) self.image_folder = image_folder self.use_system = use_system @@ -39,6 +40,9 @@ def __init__(self, data_file, image_folder, prompt_template, image_processor, to self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' self.data = self.load_data_list() + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + def load_data_list(self): data_list = [] for idx in range(len(self.df)): @@ -80,46 +84,7 @@ def __len__(self): def __getitem__(self, idx): data = self.data[idx] - data_dict = {'img_id': data['img_id']} - - text = data['question'] - text = DEFAULT_IMAGE_TOKEN + '\n' + text - - if self.use_system: - inputs = self.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.template['INSTRUCTION'].format(input=text, round=1) - - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - ids.append(IMAGE_TOKEN_INDEX) - ids = torch.tensor(ids) - data_dict['input_ids'] = ids - - # 发现重新生成数据集后,感知部分还是对不上,推理部分对的上,暂时不清楚原因 - # image = self.get_image(data['img']).convert('RGB') - image = Image.open(os.path.join(self.image_folder, - data['image_path'])).convert('RGB') - if self.pad_image_to_square: - image = expand2square( - image, - tuple( - int(x * 255) for x in self.image_processor.image_mean)) - image = 
self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - + data_dict = self.proxy_eval_dataset.getitem(idx, data) return data_dict @master_only diff --git a/xtuner/dataset/evaluation/multiple_choice_llava_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py similarity index 79% rename from xtuner/dataset/evaluation/multiple_choice_llava_dataset.py rename to xtuner/dataset/evaluation/multiple_choice_dataset.py index 54cf0609f..e62e73401 100644 --- a/xtuner/dataset/evaluation/multiple_choice_llava_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -5,24 +5,23 @@ import numpy as np import pandas as pd -import torch from mmengine.dist import (master_only) from rich.console import Console from rich.table import Table from .base_eval_dataset import BaseEvalDataset -from xtuner.dataset.utils import decode_base64_to_image, expand2square -from xtuner.tools.utils import is_cn_string -from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from xtuner.dataset.utils import decode_base64_to_image from xtuner.registry import BUILDER from mmengine.logging import print_log +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset -class MultipleChoiceLLaVADataset(BaseEvalDataset): +class MultipleChoiceDataset(BaseEvalDataset): # 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d' + METAINFO: dict = dict(name='multiple_choice') def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, - use_system=False, metainfo=None): + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): super().__init__(metainfo) self.use_system = use_system self.data_file = data_file @@ -40,6 +39,9 @@ def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_i self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' self.data = self.load_data_list() + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + def get_image(self, image): while len(image) < 16: image = self.df[self.df['index'] == int(image)]['image'].values @@ -89,54 +91,7 @@ def load_data_list(self): def __getitem__(self, idx): data = self.data[idx] - data_dict = {'img_id': data['img_id']} - - if data['context'] is not None: - text = data['context'] + '\n' + data[ - 'question'] + '\n' + data['options'] - else: - text = data['question'] + '\n' + data['options'] - - text = DEFAULT_IMAGE_TOKEN + '\n' + text - - if is_cn_string(text): - text = text + '请直接回答选项字母。' - else: - text = text + ("Answer with the option's letter from the " - 'given choices directly.') - - if self.use_system: - inputs = self.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.template['INSTRUCTION'].format(input=text, round=1) - - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - ids.append(IMAGE_TOKEN_INDEX) - ids = torch.tensor(ids) - data_dict['input_ids'] = ids - - image = self.get_image(data['img']).convert('RGB') - if self.pad_image_to_square: - image = expand2square( - image, - tuple( - 
int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - + data_dict = self.proxy_eval_dataset.getitem(idx, data) return data_dict def load_from_df(self, idx, key): diff --git a/xtuner/dataset/evaluation/pope_llava_dataset.py b/xtuner/dataset/evaluation/pope_dataset.py similarity index 75% rename from xtuner/dataset/evaluation/pope_llava_dataset.py rename to xtuner/dataset/evaluation/pope_dataset.py index 9fdca6f2d..780784eb8 100644 --- a/xtuner/dataset/evaluation/pope_llava_dataset.py +++ b/xtuner/dataset/evaluation/pope_dataset.py @@ -1,18 +1,15 @@ -import json import os import pandas as pd -import torch from mmengine.dist import master_only from PIL import Image -from xtuner.dataset.utils import expand2square from xtuner.registry import BUILDER -from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX from mmengine.logging import print_log from .base_eval_dataset import BaseEvalDataset from .utils import YOrN_Extraction, load_jsonl +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset def eval_func(pred_list, label_list): @@ -46,10 +43,11 @@ def eval_func(pred_list, label_list): return f1 -class POPELLaVADataset(BaseEvalDataset): +class POPEDataset(BaseEvalDataset): + METAINFO: dict = dict(name='pope') def __init__(self, data_file, coco_val_path, prompt_template, image_processor, tokenizer, pad_image_to_square=True, - use_system=False, metainfo=None): + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): super().__init__(metainfo) self.use_system = use_system if isinstance(data_file, str): @@ -72,6 +70,9 @@ def __init__(self, data_file, coco_val_path, prompt_template, image_processor, t self.results_xlsx_path = 'pope-results.xlsx' self.data = self.load_data_list() + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + def get_image(self, image): image = Image.open(os.path.join(self.coco_val_path, image)) return image @@ -105,43 +106,7 @@ def load_data_list(self): def __getitem__(self, idx): data = self.data[idx] - data_dict = {'img_id': data['img_id']} - - text = data['question'] - text = DEFAULT_IMAGE_TOKEN + '\n' + text - # We did not add “\nAnswer the question using a single word or phrase.” to the prompt - if self.use_system: - inputs = self.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.template['INSTRUCTION'].format(input=text, round=1) - - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode( - chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - ids.append(IMAGE_TOKEN_INDEX) - ids = torch.tensor(ids) - data_dict['input_ids'] = ids - - image = self.get_image(data['img']).convert('RGB') - if self.pad_image_to_square: - image = expand2square( - image, - tuple(int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - + data_dict = self.proxy_eval_dataset.getitem(idx, data) return data_dict @master_only diff --git a/xtuner/dataset/evaluation/textvqa_llava_dataset.py 
b/xtuner/dataset/evaluation/textvqa_dataset.py similarity index 66% rename from xtuner/dataset/evaluation/textvqa_llava_dataset.py rename to xtuner/dataset/evaluation/textvqa_dataset.py index 133d00de6..e786c2f8f 100644 --- a/xtuner/dataset/evaluation/textvqa_llava_dataset.py +++ b/xtuner/dataset/evaluation/textvqa_dataset.py @@ -2,17 +2,14 @@ import os.path as osp import re -import torch from .base_eval_dataset import BaseEvalDataset -from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) from xtuner.registry import BUILDER import json -from PIL import Image -from xtuner.dataset.utils import expand2square from mmengine.dist import (master_only) from .textvqa_utils import TextVQAAccuracyEvaluator from mmengine.logging import print_log +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset def prompt_processor(prompt): @@ -33,9 +30,12 @@ def prompt_processor(prompt): return question.lower() -class TextVQALLaVADataset(BaseEvalDataset): +class TextVQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='textvqa') + def __init__(self, data_file, ann_file, image_folder, prompt_template, image_processor, tokenizer, - pad_image_to_square=True, use_system=False, metainfo=None): + pad_image_to_square=True, use_system=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): super().__init__(metainfo) self.use_system = use_system self.data_file = data_file @@ -52,52 +52,23 @@ def __init__(self, data_file, ann_file, image_folder, prompt_template, image_pro self.results_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.jsonl' self.data = self.load_data_list() + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + def load_data_list(self): - return [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + for i, d in enumerate(data): + d['img_id'] = i + d['image_path'] = d['image'] + d['question'] = d['text'] + return data def __len__(self): return len(self.data) def __getitem__(self, idx): - data_dict = {'img_id': idx} - line = self.data[idx] - image_file = line["image"] - qs = line["text"] - - text = DEFAULT_IMAGE_TOKEN + '\n' + qs - - if self.use_system: - inputs = self.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.template['INSTRUCTION'].format(input=text, round=1) - - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode(chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - ids.append(IMAGE_TOKEN_INDEX) - ids = torch.tensor(ids) - data_dict['input_ids'] = ids - - image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') - if self.pad_image_to_square: - image = expand2square( - image, - tuple( - int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) return data_dict @master_only diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py new file mode 100644 index 
000000000..5c26d00ed --- /dev/null +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -0,0 +1,75 @@ +from xtuner.dataset.utils import expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string + + +class LLaVAProxyEvalDataset: + def __init__(self, eval_dataset): + self.eval_ds = eval_dataset + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + if self.eval_ds.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + image = self.eval_ds.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict From 94d8bc9605fddc85b44de5dd1ecffc61920fe078 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 7 Apr 2024 11:43:10 +0800 Subject: [PATCH 018/126] update cfg --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 86 ++++++++++++++++-- ...ip_so400m_p14_384_loar_e1_gpu8_finetune.py | 86 ++++++++++++++++-- ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 88 +++++++++++++++++-- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 30 +++---- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 8 +- 5 files changed, 255 insertions(+), 43 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py index d1f33caf0..8bcad7190 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -13,7 +13,8 @@ from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook from xtuner.model 
import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -128,7 +129,7 @@ type=LengthGroupedSampler, length_property='modality_length', per_device_batch_size=batch_size * accumulative_counts), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn)) ####################################################################### # PART 4 Scheduler & Optimizer # @@ -232,7 +233,7 @@ # ==================== val and test cfg ======================= val_dataset = [ dict( - type=MMELLaVADataset, + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -240,7 +241,7 @@ image_processor=image_processor, pad_image_to_square=True), # dict( - # type=MultipleChoiceLLaVADataset, + # type=MultipleChoiceDataset, # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', # prompt_template=PROMPT_TEMPLATE.vicuna, # tokenizer=tokenizer, @@ -250,16 +251,87 @@ test_dataset = [ dict( - type=MMELLaVADataset, + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + 
pad_image_to_square=True), + dict( + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py index bfc091ab0..39ddcedf0 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py @@ -13,7 +13,8 @@ from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -131,7 +132,7 @@ type=LengthGroupedSampler, length_property='modality_length', per_device_batch_size=batch_size * accumulative_counts), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn)) ####################################################################### # PART 4 Scheduler & Optimizer # @@ -235,7 +236,7 @@ # ==================== val and test cfg ======================= val_dataset = [ dict( - type=MMELLaVADataset, + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -243,7 +244,7 @@ image_processor=image_processor, pad_image_to_square=True), # dict( - # type=MultipleChoiceLLaVADataset, + # type=MultipleChoiceDataset, # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', # prompt_template=PROMPT_TEMPLATE.vicuna, # tokenizer=tokenizer, @@ -253,16 +254,87 @@ test_dataset = [ dict( - type=MMELLaVADataset, + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index 2eedceca0..54d80c67a 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -13,7 +13,8 @@ from 
xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -111,7 +112,7 @@ type=LengthGroupedSampler, length_property='modality_length', per_device_batch_size=batch_size * accumulative_counts), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn)) ####################################################################### # PART 4 Scheduler & Optimizer # @@ -215,7 +216,7 @@ # ==================== val and test cfg ======================= val_dataset = [ dict( - type=MMELLaVADataset, + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -223,7 +224,7 @@ image_processor=image_processor, pad_image_to_square=True), # dict( - # type=MultipleChoiceLLaVADataset, + # type=MultipleChoiceDataset, # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', # prompt_template=PROMPT_TEMPLATE.vicuna, # tokenizer=tokenizer, @@ -233,24 +234,93 @@ test_dataset = [ dict( - type=MMELLaVADataset, + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + 
prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True) ] -# TODO: We are not currently using val_evaluator -# Don't support num_workers > 0 # TODO: We are not currently using val_evaluator # Don't support num_workers > 0 val_dataloader = dict( diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index 57f58b675..e4f40dcb8 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -13,8 +13,8 @@ from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset, POPELLaVADataset, \ - HallusionLLaVADataset, TextVQALLaVADataset +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -216,7 +216,7 @@ # ==================== val and test cfg ======================= val_dataset = [ dict( - type=MMELLaVADataset, + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -224,7 +224,7 @@ image_processor=image_processor, pad_image_to_square=True), # dict( - # type=MultipleChoiceLLaVADataset, + # type=MultipleChoiceDataset, # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', # prompt_template=PROMPT_TEMPLATE.vicuna, # tokenizer=tokenizer, @@ -234,56 +234,56 @@ test_dataset = [ dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - 
type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=MultipleChoiceLLaVADataset, + type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=TextVQALLaVADataset, + type=TextVQADataset, data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', @@ -292,7 +292,7 @@ image_processor=image_processor, pad_image_to_square=True), dict( - type=MMELLaVADataset, + type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -301,14 +301,14 @@ # for_llava_prompt=True, # 开了后,perception 会掉 pad_image_to_square=True), dict( - type=HallusionLLaVADataset, + type=HallusionDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( - type=POPELLaVADataset, + type=POPEDataset, data_file=[ '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index 2f56d0637..0547c9a14 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -8,14 +8,12 @@ SiglipImageProcessor, SiglipVisionModel) from xtuner.dataset import LLaVADataset -from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import 
llava_map_fn, template_map_fn_factory from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook -from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from xtuner.engine.runner import TrainLoop from xtuner.utils import PROMPT_TEMPLATE from xtuner.model import LLaVAModel -from xtuner.dataset.evaluation import MMELLaVADataset, MultipleChoiceLLaVADataset -from xtuner.dataset import ConcatDataset ####################################################################### # PART 1 Settings # @@ -103,7 +101,7 @@ pin_memory=True, dataset=llava_dataset, sampler=dict(type=DefaultSampler, shuffle=True), - collate_fn=dict(type=default_collate_fn)) + collate_fn=dict(type=mm_collate_fn)) ####################################################################### # PART 4 Scheduler & Optimizer # From 2efb4568dbb4a345eb8453dc5ca156bd810143f5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 8 Apr 2024 09:45:30 +0800 Subject: [PATCH 019/126] add anyres --- ...glip_so400m_p14_anyres_e1_gpu8_finetune.py | 348 ++++++++++++++++++ xtuner/dataset/__init__.py | 3 +- xtuner/dataset/collate_fns/mm_collate_fn.py | 5 +- xtuner/dataset/llava.py | 35 +- xtuner/dataset/utils.py | 175 ++++++++- 5 files changed, 562 insertions(+), 4 deletions(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py new file mode 100644 index 000000000..1b8b52881 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py @@ -0,0 +1,348 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
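A minimal sketch of why this config switches the dataloader to mm_collate_fn with extra_collate_keys=['orig_size'], assuming invented crop counts (the shapes below are illustrative, not values taken from this patch): the any-resolution dataset configured in this file yields a variable number of 384x384 crops per image plus the original image size, and the collate change later in this commit only stacks pixel values when every sample has the same shape.

import torch

# Two hypothetical any-res samples: a 2x2 patch grid and a 1x2 grid, each with one
# additional global 384x384 view, giving 5 and 3 crops respectively.
pixel_values = [torch.zeros(5, 3, 384, 384), torch.zeros(3, 3, 384, 384)]

# mm_collate_fn keeps pixel_values as a list when the per-sample shapes differ, so
# the model can process each image separately together with the collected
# 'orig_size' entries; it only stacks when all shapes match.
if all(x.shape == pixel_values[0].shape for x in pixel_values):
    pixel_values = torch.stack(pixel_values, dim=0)
print(type(pixel_values))  # <class 'list'>: the two samples differ in patch count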
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + image_folder=image_folder, + 
tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index 19ef58b9d..8b2e6f02f 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -6,7 +6,7 @@ from .intern_repo import (build_packed_dataset, load_intern_repo_tokenized_dataset, load_intern_repo_untokenized_dataset) -from .llava import LLaVADataset +from .llava import LLaVADataset, AnyResLLaVADataset from .modelscope import process_ms_dataset from .moss_sft import MOSSSFTDataset from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset, @@ -32,4 +32,5 @@ 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset', + 'AnyResLLaVADataset' ] diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py index 464954436..67d2a8f7b 100644 --- a/xtuner/dataset/collate_fns/mm_collate_fn.py +++ b/xtuner/dataset/collate_fns/mm_collate_fn.py @@ -64,7 +64,10 @@ def mm_collate_fn(instances: Sequence[Dict], } if has_image: - pixel_values = torch.stack(pixel_values) + # if all images have the same size, stack them into a single tensor + # else, keep them as a list of tensors + if all(x.shape == pixel_values[0].shape for x in pixel_values): + pixel_values = torch.stack(pixel_values, dim=0) data_dict['pixel_values'] = pixel_values if extra_collate_keys is not None: diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 1c337c877..ec087293f 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -13,7 +13,7 @@ from xtuner.registry import BUILDER from .huggingface import process_hf_dataset -from .utils import expand2square +from .utils import expand2square, process_anyres_image class LLaVADataset(Dataset): @@ 
-105,3 +105,36 @@ def __getitem__(self, index): data_dict['pixel_values'] = torch.zeros(3, crop_size['height'], crop_size['width']) return data_dict + + +class AnyResLLaVADataset(LLaVADataset): + + def __init__(self, image_grid_pinpoints, *args, **kwargs): + self.image_grid_pinpoints = image_grid_pinpoints + super().__init__(*args, **kwargs) + # TODO: Assuming they are all squares. + if hasattr(self.image_processor, 'crop_size'): + self._crop_size = self.image_processor.crop_size + else: + self._crop_size = self.image_processor.size + self._patch_size = self._crop_size['height'] + self._shortest_edge = self._crop_size['height'] + + def __getitem__(self, index): + data_dict = self.text_data[index] + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(self.image_folder, + image_file)).convert('RGB') + orig_size = image.size + # use to remove padding + data_dict['orig_size'] = orig_size + image = process_anyres_image(image, self.image_processor, + self.image_grid_pinpoints, + self._patch_size, self._shortest_edge) + data_dict['pixel_values'] = image + else: + data_dict['orig_size'] = self._crop_size + data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'], + self._crop_size['width']) + return data_dict diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index 84336ddb2..aded85f3e 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -4,7 +4,8 @@ import io from io import BytesIO from itertools import chain - +import torch +import math import numpy as np import requests from PIL import Image @@ -269,3 +270,175 @@ def decode_base64_to_image(base64_string): image_data = base64.b64decode(base64_string) image = Image.open(io.BytesIO(image_data)) return image + + +# ---------------------------------------------------------------------- +# ref: https://github.com/haotian-liu/LLaVA +def select_best_resolution(original_size, possible_resolutions): + """Selects the best resolution from a list of possible resolutions based on + the original size. + + Args: + original_size (tuple): The original size of the image in the format + (width, height). + possible_resolutions (list): A list of possible resolutions in + the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). + """ + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + for width, height in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int( + original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution): + """Resize and pad an image to a target resolution while maintaining aspect + ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of + the image. + + Returns: + PIL.Image.Image: The resized and padded image. 
+ """ + original_width, original_height = image.size + target_width, target_height = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def process_anyres_image(image, processor, possible_resolutions, patch_size, shortest_edge): + """Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + possible_resolutions (str): A string representation of a list of + possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. + """ + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution) + + patches = divide_to_patches(image_padded, patch_size) + + image_original_resize = image.resize((shortest_edge, shortest_edge)) + + image_patches = [image_original_resize] + patches + image_patches = [ + processor.preprocess(image_patch, + return_tensors='pt')['pixel_values'][0] + for image_patch in image_patches + ] + return torch.stack(image_patches, dim=0) + + +def get_anyres_image_grid_shape(image_size, possible_resolutions, patch_size): + """Calculate the shape of the image patch grid after the preprocessing for + images of any resolution. + + Args: + image_size (tuple): The size of the input image in the format + (width, height). + possible_resolutions (list): A string representation of a list of + possible resolutions. + patch_size (int): The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + width, height = select_best_resolution(image_size, possible_resolutions) + return width // patch_size, height // patch_size + + +def unpad_image(tensor, original_size): + """Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of the image (height, width). + + Returns: + torch.Tensor: The unpadded image tensor. 
+ """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor +# ---------------------------------------------------------------------- From 9f8d2b3a340689fdd1b75ad5c168d7f4009a7f39 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 8 Apr 2024 14:32:47 +0800 Subject: [PATCH 020/126] add gqa --- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 13 +- xtuner/dataset/evaluation/__init__.py | 3 +- xtuner/dataset/evaluation/gqa_dataset.py | 118 +++++ xtuner/dataset/evaluation/gqa_eval_utils.py | 499 ++++++++++++++++++ xtuner/dataset/llava_proxy_eval_dataset.py | 2 +- 5 files changed, 631 insertions(+), 4 deletions(-) create mode 100644 xtuner/dataset/evaluation/gqa_dataset.py create mode 100644 xtuner/dataset/evaluation/gqa_eval_utils.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index e4f40dcb8..dead563c9 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -14,7 +14,7 @@ from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset + HallusionDataset, TextVQADataset, GQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -318,7 +318,16 @@ prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, - pad_image_to_square=True) + pad_image_to_square=True), + dict( + type=GQADataset, + question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 369775dba..76f5a333e 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -3,5 +3,6 @@ from .pope_dataset import POPEDataset from .hallusion_dataset import HallusionDataset from .textvqa_dataset import TextVQADataset +from .gqa_dataset import GQADataset -__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset'] +__all__ = ['MMEDataset', 'MultipleChoiceDataset', 
'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset'] diff --git a/xtuner/dataset/evaluation/gqa_dataset.py b/xtuner/dataset/evaluation/gqa_dataset.py new file mode 100644 index 000000000..3c93de460 --- /dev/null +++ b/xtuner/dataset/evaluation/gqa_dataset.py @@ -0,0 +1,118 @@ +import os +import os.path as osp +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .gqa_eval_utils import eval_gqa + + +class GQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='gqa') + + def __init__( + self, + question_file, + gt_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + self.data_file = question_file + self.gt_file = gt_file + # Save detailed information for easy viewing + self.answer_file = 'answer_gqa_results.jsonl' + # solely for evaluation purposes + self.prediction_file = 'pred_gqa_results.jsonl' + + self.image_folder = image_folder + self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + question_data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + data_list = [] + for idx in range(len(question_data)): + sample = question_data[idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + category = sample['category'] + + data = { + 'img_id': idx, + 'index': index, + 'image_path': image_path, + 'question': question, + 'category': category, + } + data_list.append(data) + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, results, work_dir): + answers_file = osp.join(work_dir, self.answer_file) + ans_file = open(answers_file, "w") + + for pred_dict in results: + idx = pred_dict["img_id"] + gt_data = self.data[idx] + + ans_file.write( + json.dumps( + { + "question_id": gt_data['index'], + "prompt": gt_data['question'], + "text": pred_dict['prediction'], + "metadata": {}, + } + ) + + "\n" + ) + ans_file.close() + + all_preds = [] + for line_idx, line in enumerate(open(answers_file)): + res = json.loads(line) + question_id = res['question_id'] + text = res['text'].rstrip('.').lower() + all_preds.append({"questionId": question_id, "prediction": text}) + + prediction_file = osp.join(work_dir, self.prediction_file) + with open(prediction_file, 'w') as f: + json.dump(all_preds, f) + + evaluator = eval_gqa(questions=self.gt_file, predictions=prediction_file) + print_log('============================================', 'current') + metrics = evaluator.forward() + print_log('============================================', 'current') + print_log(f'GQA successfully finished evaluating', 'current') + return metrics diff --git 
a/xtuner/dataset/evaluation/gqa_eval_utils.py b/xtuner/dataset/evaluation/gqa_eval_utils.py new file mode 100644 index 000000000..f8fb9ae7d --- /dev/null +++ b/xtuner/dataset/evaluation/gqa_eval_utils.py @@ -0,0 +1,499 @@ +# Evaluation code for GQA. +# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length. +# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides. +# +# +# Metrics: +# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against +# cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the +# predicted answer p matches a and 0 otherwise, and average over all questions in the dataset. +# +# - Consistency: A metric for the level of model's consistency across different questions. For each question-answer +# pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can +# be unambiguously inferred given (q,a). +# Denote Q the set of all questions the model answered correctly. For each question q in Q, we +# measure the model's accuracy over the entailed questions Eq to get the score sq and finally +# average these results across all questions in Q. +# +# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer +# to the question (e.g. a color to a color question, yes/no to a binary question etc.). +# We provide a set of valid answers to each questions over the final answer vocabulary, in +# the choices file, and use it to compute average validity across the dataset. +# +# - Plausibility: Measures whether the model answers are plausible, e.g. one that make sense in the real world, +# e.g. not answering "purple" to a question about apple color (unless it's really purple). +# We provide a set of all plausible answers to each questions, computed by looking at all +# attributes and relations hold for various objects throughout the whole dataset scene graphs, +# and use it to compute average model plausibility across the data. +# +# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the +# image when answering a question. Each question in the dataset is annotated with the visual regions +# they refer to, which are then used to compute the level to which the model has a correct visual attention, +# which will allow to identify whether it really answers based on the image of by language-based guesses. +# Supports both spatial features and object-based features. +# +# - Distribution: Measures the overall match between the true answer distribution for different questions, +# vs the overall distribution predicted by the model through its answers for all the data. +# We use chi-square statistic to measure the degree of similarity between the distributions, +# giving indication to the level of overall world-knowledge of the model +# +# - Accuracy per type: accuracy per question structural types (logic, compare, choose), and semantic type +# (questions about attributes, relations, categories, objects or the whole scene). +# +# - Accuracy for length: accuracy as a function of the question length, in terms of (1) words number, and semantic +# complexity - number of reasoning steps. +# +# We may support additional metrics (e.g. coverage) in the future. 
+#
+#
+# Files format:
+# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
+# - attentions file format: JSON array:
+# Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }].
+# Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1.
+# - questions and choices files are provided as part of the dataset.
+# see https://gqadataset.org/download.html for information about their format.
+#
+#
+# If you have any questions or comments, please feel free to send an email,
+# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
+#
+#
+# import torch.nn as nn
+from collections import defaultdict
+from tqdm import tqdm
+import os.path
+import glob
+import json
+from mmengine.logging import print_log
+
+
+##########################################################################################
+class eval_gqa():
+
+    def __init__(
+            self,
+            tier="val",
+            scenes="{tier}_sceneGraphs.json",
+            questions="{tier}_all_questions.json",
+            choices="{tier}_choices.json",
+            predictions="{tier}_predictions.json",
+            attentions="{tier}_attentions.json",
+            consistency=False,
+            grounding=False,
+            objectFeatures=False,
+            mapSize=7,
+    ):
+
+        self.consistency = consistency
+        self.grounding = grounding
+        self.objectFeatures = objectFeatures
+        self.mapSize = mapSize
+        if not consistency:
+            print_log("Please consider using --consistency to compute consistency scores for entailed questions.",
+                      'current')
+            print_log("If you do so, please provide answers to all questions in val_all_questions.json.\n", 'current')
+
+        if not grounding:
+            print_log("Please consider using --grounding to compute attention scores.", 'current')
+            print_log("If you do so, please provide attention maps through --attentions.\n", 'current')
+
+        ##### Files Loading
+        ##########################################################################################
+        # Load scene graphs
+        print_log("Loading scene graphs...", 'current')
+        try:
+            self.scenes = self.loadFile(scenes.format(tier=tier))
+        except:
+            print_log('Failed to load scene graphs -- cannot evaluate grounding', 'current')
+            self.scenes = None  # for testdev
+
+        # Load questions
+        print_log("Loading questions...", 'current')
+        self.questions = self.loadFile(questions)
+
+        # Load choices
+        print_log("Loading choices...", 'current')
+        try:
+            self.choices = self.loadFile(choices.format(tier=tier))
+        except:
+            print_log('Failed to load choices -- cannot evaluate validity or plausibility', 'current')
+            self.choices = None  # for testdev
+
+        # Load predictions and turn them into a dictionary
+        print_log("Loading predictions...", 'current')
+        predictions = self.loadFile(predictions.format(tier=tier))
+        self.predictions = {p["questionId"]: p["prediction"] for p in predictions}
+
+        # Make sure all questions have predictions
+        for qid in self.questions:
+            if (qid not in self.predictions) and (consistency or self.questions[qid]["isBalanced"]):
+                print_log("no prediction for question {}. Please add prediction for all questions.".format(qid),
+                          'current')
+                raise Exception("missing predictions")
+
+        # Load attentions and turn them into a dictionary
+        self.attentions = None
+        if grounding:
+            with open(attentions.format(tier=tier)) as attentionsFile:
+                attentions = json.load(attentionsFile)
+            self.attentions = {a["questionId"]: a["attention"] for a in attentions}
+
+    def forward(self):
+        # Initialize data structure to track all metrics: e.g.
accuracy, validity and plausibility, as well as + # accuracy per question type, length and number of reasoning steps. + scores = { + "accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately. + "binary": [], + # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately. + "open": [], # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately. + "validity": [], # list of validity per question (1 if valid else 0). + "plausibility": [], # list of plausibility per question (1 if plausible else 0). + "consistency": [], # list of consistency scores for entailed questions. + "accuracyPerStructuralType": defaultdict(list), + # list of question accuracies for each structural type (e.g. compare, logic questions). + "accuracyPerSemanticType": defaultdict(list), + # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation). + "accuracyPerLength": defaultdict(list), # list of question accuracies per question's word number. + "accuracyPerSteps": defaultdict(list), + # list of question accuracies per question's reasoning length (steps number). + "grounding": [], # list of grounding scores for each question. + } + + # Initialize golden and predicted histograms per each question group. Used to compute the distribution metric. + dist = {"gold": defaultdict(lambda: defaultdict(int)), "predicted": defaultdict(lambda: defaultdict(int))} + ##### Main score computation + ########################################################################################## + + # Loop over the questions and compute mterics + for qid, question in tqdm(self.questions.items()): + + # Compute scores over the balanced dataset (more robust against cheating by making educated guesses) + if question["isBalanced"]: + gold = question["answer"] + predicted = self.predictions[qid] + + correct = predicted == gold + score = self.toScore(correct) + + wordsNum = self.getWordsNum(question) + stepsNum = self.getStepsNum(question) + + # Update accuracy + scores["accuracy"].append(score) + scores["accuracyPerLength"][wordsNum].append(score) + scores["accuracyPerSteps"][stepsNum].append(score) + scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score) + scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score) + answerType = "open" if question["types"]["structural"] == "query" else "binary" + scores[answerType].append(score) + + # Update validity score + valid = ( + self.belongs(predicted, self.choices[qid]["valid"], question) if self.choices else False + ) + scores["validity"].append(self.toScore(valid)) + + # Update plausibility score + plausible = ( + self.belongs(predicted, self.choices[qid]["plausible"], question) + if self.choices + else False + ) + scores["plausibility"].append(self.toScore(plausible)) + + # Optionally compute grounding (attention) score + if self.attentions is not None: + groundingScore = self.computeGroundingScore( + question, self.scenes[question["imageId"]], self.attentions[qid] + ) + if groundingScore is not None: + scores["grounding"].append(groundingScore) + + # Update histograms for gold and predicted answers + globalGroup = question["groups"]["global"] + if globalGroup is not None: + dist["gold"][globalGroup][gold] += 1 + dist["predicted"][globalGroup][predicted] += 1 + + if self.consistency: + # Compute consistency (for entailed questions) + scores = self.updateConsistency(qid, question, self.questions, 
correct, scores) + + # Compute distribution score + scores["distribution"] = self.chiSquare(dist["gold"], dist["predicted"]) / 100 + + # Average scores over all questions (in the balanced dataset) and print_log scores + + metrics = [ + "binary", + "open", + "accuracy", + "consistency", + "validity", + "plausibility", + "grounding", + "distribution", + ] + + detailedMetrics = [ + ("accuracyPerStructuralType", "Accuracy / structural type"), + ("accuracyPerSemanticType", "Accuracy / semantic type"), + ("accuracyPerSteps", "Accuracy / steps number"), + ("accuracyPerLength", "Accuracy / words number"), + ] + + subMetrics = {"attr": "attribute", "cat": "category", "global": "scene", "obj": "object", "rel": "relation"} + # average + for k in metrics: + if isinstance(scores[k], list): + scores[k] = self.avg(scores[k]) * 100 + + for k, _ in detailedMetrics: + for t in scores[k]: + scores[k][t] = self.avg(scores[k][t]) * 100, len(scores[k][t]) + + # print_log + for m in metrics: + # skip grounding and consistency scores if not requested + if m == "grounding" and not self.grounding: + continue + if m == "consistency" and not self.consistency: + continue + + # print_log score + print_log( + "{title}: {score:.2f}{suffix}".format( + title=m.capitalize(), + score=scores[m], + suffix=" (lower is better)" if m == "distribution" else "%", + ) + , 'current') + + for m, mPrintName in detailedMetrics: + print_log("") + # print_log metric title + print_log("{}:".format(mPrintName)) + + for t in sorted(list(scores[m].keys())): + # set sub-metric title + tName = t + if isinstance(scores[k], list): + tName = subMetrics.get(t, t).capitalize() + + # print_log score + print_log( + " {title}: {score:.2f}{suffix} ({amount} questions)".format( + title=tName, score=scores[m][t][0], suffix="%", amount=scores[m][t][1] + ) + , 'current') + return metrics + + def loadFile(self, name): + # load standard json file + if os.path.isfile(name): + with open(name) as file: + data = json.load(file) + # load file chunks if too big + elif os.path.isdir(name.split(".")[0]): + data = {} + chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir=name.split(".")[0], ext=name.split(".")[1])) + for chunk in chunks: + with open(chunk) as file: + data.update(json.load(file)) + else: + raise Exception("Can't find {}".format(name)) + return data + + ##### Scores data structures initialization + ########################################################################################## + + # book to float + def toScore(self, b): + return float(1 if b else 0) + + # Compute average of a list + def avg(self, l): + if len(l) == 0: + return 0 + return float(sum(l)) / len(l) + + def wavg(self, l, w): + if sum(w) == 0: + return None + return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w) + + ##### Question lengths - words numbers and reasoning steps number + ########################################################################################## + + # Compute question length (words number) + def getWordsNum(self, question): + return len(question["question"].split()) + + # Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length) + def getStepsNum(self, question): + return len( + [ + c + for c in question["semantic"] + if not ( + any( + [ + o in "{}: {}".format(c["operation"], c["argument"]) + for o in ["exist", "query: name", "choose name"] + ] + ) + ) + ] + ) + + ##### Functions for question annotations + 
##########################################################################################
+
+    # # Utility function for converting question annotations string keys to slices
+    # def toSlice(strSlice):
+    #     sliceLims = (int(n) for n in strSlice.split(':'))
+    #     return apply(slice, sliceLims)
+
+    # # Utility function for converting question annotations string keys to indexes list:
+    # # "1" => [0]
+    # # "1:3" => [1, 2]
+    # # "4:9:2" => [4, 6, 8]
+    # def intsFromSlice(strSlice):
+    #     slice_obj = get_slice_obj(slicearg)
+    #     return range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1)
+
+    ##### Functions for validity and plausibility
+    ##########################################################################################
+
+    def belongs(self, element, group, question):
+        # normalization ()
+        if "Common" in question["types"]["detailed"]:
+            group = ["color", "material", "shape"]
+
+        return element in group
+
+    ##### Functions for consistency scores (for entailed questions ("inferred"))
+    ##########################################################################################
+
+    def updateConsistency(self, questionId, question, questions, correct, scores):
+        inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]
+
+        if correct and len(inferredQuestions) > 0:
+
+            consistencyScores = []
+            for eid in inferredQuestions:
+                gold = questions[eid]["answer"]
+                predicted = self.predictions[eid]
+                score = self.toScore(predicted == gold)
+                consistencyScores.append(score)
+
+            scores["consistency"].append(self.avg(consistencyScores))
+        return scores
+
+    ##### Functions for grounding score (optional, only for attention models)
+    ##########################################################################################
+
+    # Utility functions for working with bounding boxes.
+    # c = (x0, y0, x1, y1), r = (r0, r1)
+
+    def yrange(self, c):
+        return (c[1], c[3])
+
+    def xrange(self, c):
+        return (c[0], c[2])
+
+    def length(self, r):
+        if r is None:
+            return 0
+        return float(r[1] - r[0])
+
+    def size(self, c):
+        return self.length(self.xrange(c)) * self.length(self.yrange(c))
+
+    def intersection(self, r1, r2):
+        ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
+        if ir[1] > ir[0]:
+            return ir
+        return None
+
+    def intersectionSize(self, c1, c2):
+        return self.length(self.intersection(self.xrange(c1), self.xrange(c2))) * self.length(
+            self.intersection(self.yrange(c1), self.yrange(c2))
+        )
+
+    def intersectionRate(self, c1, c2):
+        return float(self.intersectionSize(c1, c2)) / self.size(c1)
+
+    # Get spatial cell
+    def getCell(self, i, j):
+        edge = float(1) / self.mapSize
+        return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))
+
+    # Get bounding box of objectId in sceneGraph
+    def getRegion(self, sceneGraph, objectId):
+        obj = sceneGraph["objects"][objectId]
+        x0 = float(obj["x"]) / sceneGraph["width"]
+        y0 = float(obj["y"]) / sceneGraph["height"]
+        x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
+        y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
+        return (x0, y0, x1, y1)
+
+    # Compute grounding score. Computes the amount of attention (probability) given to each of the regions
+    # the question and answers refer to.
+ def computeGroundingScore(self, question, sceneGraph, attentionMap): + ## prepare gold regions + regions = [] + # add question regions + regions += [ + self.getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values() + ] + # add answer regions + regions += [ + self.getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values() + ] + # add all the image if the question refers to the whole scene + if any(("scene" in c) for c in question["semantic"]): + regions.append((0, 0, 1, 1)) + + # prepare attention map + if self.objectFeatures: + # cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in cells] + pass + else: + cells = [ + (self.getCell(i, j), attentionMap[i][j]) + for i in range(self.mapSize) + for j in range(self.mapSize) + ] + + # compare attention map to gold regions + scores = [] + for region in regions: + for cell, attention in cells: + scores.append(attention * self.intersectionRate(cell, region)) + return sum(scores) + + ##### Functions for distribution score + ########################################################################################## + + # Compute chi square statistic of gold distribution vs predicted distribution, + # averaged over all question groups + def chiSquare(self, goldDist, predictedDist): + sumScore, sumOverall = 0, 0 + + for group in goldDist: + score, overall = 0, 0 + + for ans in goldDist[group]: + e = goldDist[group][ans] + o = predictedDist[group].get(ans, 0) + score += (float(o - e) ** 2) / e + overall += goldDist[group][ans] + + sumScore += score * overall + sumOverall += overall + + avgScore = float(sumScore) / sumOverall + + return avgScore diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index 5c26d00ed..e918c1045 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -56,7 +56,7 @@ def getitem(self, idx, data): data_dict['input_ids'] = ids # 3 process image - if self.eval_ds.metainfo['name'] in ['mme', 'textvqa']: + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: # MMEDataset or TextVQADataset image = Image.open(os.path.join(self.eval_ds.image_folder, data['image_path'])).convert('RGB') From 350f030022129c407d2873f7131a451e9e987b4d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 8 Apr 2024 14:46:51 +0800 Subject: [PATCH 021/126] fix --- xtuner/dataset/evaluation/gqa_dataset.py | 4 ++-- xtuner/dataset/evaluation/gqa_eval_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xtuner/dataset/evaluation/gqa_dataset.py b/xtuner/dataset/evaluation/gqa_dataset.py index 3c93de460..ba38c64b6 100644 --- a/xtuner/dataset/evaluation/gqa_dataset.py +++ b/xtuner/dataset/evaluation/gqa_dataset.py @@ -112,7 +112,7 @@ def evaluate(self, results, work_dir): evaluator = eval_gqa(questions=self.gt_file, predictions=prediction_file) print_log('============================================', 'current') - metrics = evaluator.forward() + scores = evaluator.forward() print_log('============================================', 'current') print_log(f'GQA successfully finished evaluating', 'current') - return metrics + return scores diff --git a/xtuner/dataset/evaluation/gqa_eval_utils.py b/xtuner/dataset/evaluation/gqa_eval_utils.py index f8fb9ae7d..9e97e26e8 100644 --- a/xtuner/dataset/evaluation/gqa_eval_utils.py +++ b/xtuner/dataset/evaluation/gqa_eval_utils.py @@ -287,7 +287,7 @@ def forward(self): title=tName, score=scores[m][t][0], 
suffix="%", amount=scores[m][t][1] ) , 'current') - return metrics + return scores def loadFile(self, name): # load standard json file From 286b6536a14a450b72c48f814ed576ac0844d7c6 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 8 Apr 2024 15:29:19 +0800 Subject: [PATCH 022/126] remove --- xtuner/engine/runner/loops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index 3256d232e..558d9d16d 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -207,8 +207,8 @@ def run(self) -> dict: self.runner.call_hook('after_test_epoch', metrics=metrics) self.runner.call_hook('after_test') self.runner.logger.info('================ Ending test loop ================') - model.gradient_checkpointing_enable() - model.train() + # model.gradient_checkpointing_enable() + # model.train() return metrics @torch.no_grad() From 6dbf0dbc58dab8ff1f651a0488fa90ef75099eaf Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 8 Apr 2024 16:38:27 +0800 Subject: [PATCH 023/126] add cfg --- ...7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py | 13 +++++++++++-- xtuner/model/llava.py | 9 +++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index 54d80c67a..a7d3dd074 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -14,7 +14,7 @@ from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset + HallusionDataset, TextVQADataset, GQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -318,7 +318,16 @@ prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, - pad_image_to_square=True) + pad_image_to_square=True), + dict( + type=GQADataset, + question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 2fff52d03..c0719a744 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -254,8 +254,13 @@ def _prepare_data_for_llm(self, data): visual_outputs = self.visual_encoder( data['pixel_values'].to(self.visual_encoder.dtype), output_hidden_states=True) - pixel_values = self.projector( - visual_outputs.hidden_states[self.visual_select_layer][:, 1:]) + if type(self.visual_encoder).__name__ == 'CLIPVisionModel': + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] + elif type(self.visual_encoder).__name__ == 'SiglipVisionModel': + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + 
else: + raise NotImplementedError + pixel_values = self.projector(visual_outputs) data['pixel_values'] = pixel_values data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data) return data From 27ce878b5c496b3733c8dbe8352812f823f9011b Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 11:36:51 +0800 Subject: [PATCH 024/126] add any res --- ...so400m_p14_anyres_e1_gpu8_all_finetune.py} | 91 +++++-- xtuner/dataset/__init__.py | 6 +- .../anyres_llava_proxy_eval_dataset.py | 83 ++++++ xtuner/model/__init__.py | 3 +- xtuner/model/anyres_llava.py | 255 ++++++++++++++++++ 5 files changed, 408 insertions(+), 30 deletions(-) rename xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/{llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py => llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py} (78%) create mode 100644 xtuner/dataset/anyres_llava_proxy_eval_dataset.py create mode 100644 xtuner/model/anyres_llava.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py similarity index 78% rename from xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py rename to xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py index 1b8b52881..ffd1d3889 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py @@ -11,20 +11,21 @@ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook -from xtuner.model import LLaVAModel +from xtuner.model import AnyResLLaVAModel from xtuner.utils import PROMPT_TEMPLATE from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset + HallusionDataset, TextVQADataset, GQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset ####################################################################### # PART 1 Settings # ####################################################################### # Model -llm_name_or_path = 'microsoft/phi-2' -visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' # Specify the pretrained pth pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' @@ -55,7 +56,10 @@ evaluation_freq = 500 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' -evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] ####################################################################### # PART 2 Model & Tokenizer & Image Processor # @@ -72,9 +76,11 @@ trust_remote_code=True) model = dict( 
- type=LLaVAModel, + type=AnyResLLaVAModel, freeze_llm=False, - freeze_visual_encoder=True, + freeze_visual_encoder=False, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, pretrained_pth=pretrained_pth, tokenizer=tokenizer, template=prompt_template, @@ -92,7 +98,8 @@ ####################################################################### llava_dataset = dict( type=AnyResLLaVADataset, - offline_processed_text_folder=None, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, @@ -156,15 +163,15 @@ # Log the dialogue periodically during the training process, optional custom_hooks = [ dict(type=DatasetInfoHook, tokenizer=tokenizer), - dict( - type=EvaluateChatHook, - tokenizer=tokenizer, - image_processor=image_processor, - every_n_iters=evaluation_freq, - evaluation_inputs=evaluation_inputs, - evaluation_images=evaluation_images, - system=SYSTEM, - prompt_template=prompt_template) + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) ] # configure default hooks @@ -218,24 +225,21 @@ val_dataset = [ dict( type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, - pad_image_to_square=True), - # dict( - # type=MultipleChoiceDataset, - # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - # prompt_template=PROMPT_TEMPLATE.vicuna, - # tokenizer=tokenizer, - # image_processor=image_processor, - # pad_image_to_square=True) + pad_image_to_square=True) ] test_dataset = [ dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -243,6 +247,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -250,6 +256,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -257,6 +265,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -264,6 +274,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -271,6 +283,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -278,6 +292,8 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -285,6 +301,8 @@ pad_image_to_square=True), dict( type=TextVQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', @@ -294,6 +312,8 @@ pad_image_to_square=True), dict( type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', prompt_template=PROMPT_TEMPLATE.vicuna, @@ -303,6 +323,8 @@ pad_image_to_square=True), dict( type=HallusionDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -310,6 +332,8 @@ pad_image_to_square=True), dict( type=POPEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), data_file=[ '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', @@ -319,7 +343,18 @@ prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, - pad_image_to_square=True) + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), ] # TODO: We are not currently using val_evaluator @@ -330,7 +365,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=val_dataset), - collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size'])) val_evaluator = dict() val_cfg = dict(type=ValLoop) @@ -341,7 +376,7 @@ drop_last=False, sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict(type=ConcatDataset, datasets=test_dataset), - collate_fn=dict(type=mm_collate_fn, 
extra_collate_keys=['img_id']) + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size']) ) test_evaluator = val_evaluator diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index 8b2e6f02f..936abd606 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -12,6 +12,8 @@ from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset, RefCOCOJsonEvalDataset) from .utils import decode_base64_to_image, expand2square, load_image +from .llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -32,5 +34,7 @@ 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset', - 'AnyResLLaVADataset' + 'AnyResLLaVADataset', + 'LLaVAProxyEvalDataset', + 'AnyResLLaVAProxyEvalDataset' ] diff --git a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py new file mode 100644 index 000000000..1718bcde9 --- /dev/null +++ b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py @@ -0,0 +1,83 @@ +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +from .utils import process_anyres_image + + +class AnyResLLaVAProxyEvalDataset: + def __init__(self, eval_dataset, image_grid_pinpoints): + self.eval_ds = eval_dataset + self.image_grid_pinpoints = image_grid_pinpoints + + # TODO: Assuming they are all squares. + if hasattr(eval_dataset.image_processor, 'crop_size'): + self._crop_size = eval_dataset.image_processor.crop_size + else: + self._crop_size = eval_dataset.image_processor.size + self._patch_size = self._crop_size['height'] + self._shortest_edge = self._crop_size['height'] + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + 
orig_size = image.size + # use to remove padding + data_dict['orig_size'] = orig_size + image = process_anyres_image(image, self.eval_ds.image_processor, + self.image_grid_pinpoints, + self._patch_size, self._shortest_edge) + data_dict['pixel_values'] = image + + return data_dict diff --git a/xtuner/model/__init__.py b/xtuner/model/__init__.py index 39547b2d7..dd0f57eb3 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .llava import LLaVAModel from .sft import SupervisedFinetune +from .anyres_llava import AnyResLLaVAModel -__all__ = ['SupervisedFinetune', 'LLaVAModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel'] diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py new file mode 100644 index 000000000..ac11ea689 --- /dev/null +++ b/xtuner/model/anyres_llava.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from ..dataset.utils import get_anyres_image_grid_shape, unpad_image +from .llava import LLaVAModel +from collections import OrderedDict +import torch + +from xtuner.registry import BUILDER +from .modules import ProjectorConfig, ProjectorModel, dispatch_modules +from .utils import (LoadWoInit, + get_peft_model_state_dict, guess_load_checkpoint, + make_inputs_require_grad, + prepare_inputs_labels_for_multimodal) + + +class AnyResLLaVAModel(LLaVAModel): + + def __init__(self, llm, + visual_encoder, + freeze_llm=False, + freeze_visual_encoder=False, + visual_select_layer=-2, + pretrained_pth=None, + projector_depth=2, + llm_lora=None, + visual_encoder_lora=None, + use_activation_checkpointing=True, + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None, + image_grid_pinpoints=None, + token_merge_ratio=4): + super(LLaVAModel, self).__init__() + self.freeze_llm = freeze_llm + self.freeze_visual_encoder = freeze_visual_encoder + with LoadWoInit(): + if isinstance(llm, dict): + llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) + + self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( + visual_encoder) + self.llm.config.use_cache = False + dispatch_modules(self.llm) + + if token_merge_ratio != 1: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size, + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + else: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size * token_merge_ratio, + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth, + llm_hidden_size_ratio=1) + self.projector = ProjectorModel(projector_config).to( + self.visual_encoder.dtype) + + if self.freeze_llm: + self.llm.requires_grad_(False) + if self.freeze_visual_encoder: + self.visual_encoder.requires_grad_(False) + + self.use_activation_checkpointing = use_activation_checkpointing + if use_activation_checkpointing: + # For backward compatibility + if hasattr(self.llm, 'enable_input_require_grads'): + self.llm.enable_input_require_grads() + else: + self.llm.get_input_embeddings().register_forward_hook( + make_inputs_require_grad) + if hasattr(self.visual_encoder, 'enable_input_require_grads'): + self.visual_encoder.enable_input_require_grads() + else: + self.visual_encoder.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + self.projector.enable_input_require_grads() + + # enable gradient (activation) checkpointing for memory efficiency + 
self.gradient_checkpointing_enable() + + self.use_llm_lora = llm_lora is not None + self.use_visual_encoder_lora = visual_encoder_lora is not None + + if self.use_llm_lora: + self._prepare_llm_for_lora(llm_lora, use_activation_checkpointing) + if self.use_visual_encoder_lora: + self._prepare_visual_encoder_for_lora( + visual_encoder_lora, use_activation_checkpointing) + + if pretrained_pth is not None: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'Load pretrained weight from {pretrained_pth}') + + self.visual_select_layer = visual_select_layer + + self._is_init = True + + self.tokenizer = tokenizer + if tokenizer is not None: + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = image_processor + if image_processor is not None: + self.image_processor = BUILDER.build(image_processor) + self.template = template + + self.token_merge_ratio = token_merge_ratio + # self.image_newline = torch.randn( + # self.llm.config.hidden_size, dtype=self.visual_encoder.dtype) + self.image_grid_pinpoints = image_grid_pinpoints + # self.mm_patch_merge_type = 'spatial_unpad' + self.image_aspect_ratio = 'anyres' + + def state_dict(self, *args, **kwargs): + state_dict = super(LLaVAModel, self).state_dict(*args, **kwargs) + to_return = OrderedDict() + # Step 1. visual_encoder + if self.use_visual_encoder_lora: + to_return.update( + get_peft_model_state_dict( + self.visual_encoder, state_dict=state_dict)) + elif not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder.' in k + }) + # Step 2. LLM + if self.use_llm_lora: + to_return.update( + get_peft_model_state_dict(self.llm, state_dict=state_dict)) + elif not self.freeze_llm: + to_return.update( + {k: v + for k, v in state_dict.items() if 'llm.' in k}) + # Step 3. Projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'projector.' in k}) + # Step 4. Image Newline + to_return.update( + {k: v + for k, v in state_dict.items() if 'image_newline.' 
in k})
+        return to_return
+
+    def _prepare_data_for_llm(self, data):
+        if 'pixel_values' in data:
+            new_image_feature = self.__preprocess_for_pixel_values(data)
+            data['pixel_values'] = new_image_feature
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+
+    def __preprocess_for_pixel_values(self, data):
+        orig_sizes = data['orig_size']
+        pixel_values = data['pixel_values']
+
+        if type(pixel_values) is list or pixel_values.ndim == 5:
+            if type(pixel_values) is list:
+                pixel_values = [
+                    x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
+                ]
+            # b*n, c, h, w
+            concat_images = torch.cat([image for image in pixel_values], dim=0)
+        else:
+            raise NotImplementedError()
+
+        # b*n, 27*27, d
+        visual_outputs = self.visual_encoder(
+            concat_images, output_hidden_states=True)
+        if type(self.visual_encoder).__name__ == 'CLIPVisionModel':
+            visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+        elif type(self.visual_encoder).__name__ == 'SiglipVisionModel':
+            visual_outputs = visual_outputs.hidden_states[self.visual_select_layer]
+        else:
+            raise NotImplementedError
+
+        bs, pn, hs = visual_outputs.shape
+        # token merge
+        if self.token_merge_ratio != 1:
+            # 27 is odd, so the token count is not evenly divisible and needs hard-coded handling
+            if pn == 27 * 27:
+                # simply drop the last token so the count becomes divisible
+                visual_outputs = visual_outputs[:, :-1]
+                visual_outputs = visual_outputs.view(bs, (pn-1) // self.token_merge_ratio, int(hs * 4))
+
+        # b*n, 182, d
+        image_features = self.projector(visual_outputs)
+
+        split_sizes = [image.shape[0] for image in pixel_values]
+        image_features = torch.split(image_features, split_sizes, dim=0)
+
+        new_image_feature = []
+        # since token merge is applied, the unpad step is no longer needed
+        if self.token_merge_ratio == 1:
+            for image_idx, image_feature in enumerate(image_features):
+                if image_feature.shape[0] > 1:
+                    base_image_feature = image_feature[0]
+                    image_feature = image_feature[1:]
+                    height = width = self.visual_encoder.config.image_size \
+                                     // self.visual_encoder.config.patch_size
+                    assert height * width == base_image_feature.shape[0]
+                    if self.image_aspect_ratio == 'anyres':
+                        num_patch = get_anyres_image_grid_shape(
+                            orig_sizes[image_idx], self.image_grid_pinpoints,
+                            self.visual_encoder.config.image_size)
+                        num_patch_width, num_patch_height = num_patch
+                        image_feature = image_feature.view(num_patch_height,
+                                                           num_patch_width, height,
+                                                           width, -1)
+                    else:
+                        raise NotImplementedError
+
+                    if 'unpad' in self.mm_patch_merge_type:
+                        image_feature = image_feature.permute(4, 0, 2, 1,
+                                                              3).contiguous()
+                        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                        image_feature = unpad_image(image_feature,
+                                                    orig_sizes[image_idx])
+                        image_feature = torch.cat(
+                            (image_feature,
+                             self.image_newline[:, None, None].expand(
+                                 *image_feature.shape[:-1], 1)),
+                            dim=-1)
+                        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+                    else:
+                        image_feature = image_feature.permute(0, 2, 1, 3,
+                                                              4).contiguous()
+                        image_feature = image_feature.flatten(0, 3)
+                    image_feature = torch.cat((base_image_feature, image_feature),
+                                              dim=0)
+                else:
+                    image_feature = image_feature[0]
+                    if 'unpad' in self.mm_patch_merge_type:
+                        image_feature = torch.cat(
+                            (image_feature, self.image_newline[None]), dim=0)
+                new_image_feature.append(image_feature)
+        else:
+            new_image_feature = []
+            for image_idx, image_feature in enumerate(image_features):
+                if image_feature.shape[0] > 1:
+                    # 182, d
+                    base_image_feature = image_feature[0]
+                    # n, 182, d
+                    image_feature = image_feature[1:]
+                    # n*182,d
+                    image_feature = image_feature.flatten(0, 1)
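+                    # ((n + 1) * 182, d): put the global base-image feature in front of the flattened per-patch features
+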
image_feature = torch.cat((base_image_feature, image_feature), dim=0) + new_image_feature.append(image_feature) + else: + image_feature = image_feature[0] + new_image_feature.append(image_feature) + return new_image_feature From cdac294a7c1f8bfdea6d3160e5694942135a4c9a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 11:39:22 +0800 Subject: [PATCH 025/126] update --- ...a_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py index ffd1d3889..028aa0a54 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py @@ -50,7 +50,7 @@ # Save save_steps = 500 -save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training evaluation_freq = 500 From e25b70c2384be41f1ece2b5b1cbb59fe32f22a07 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 11:48:33 +0800 Subject: [PATCH 026/126] fix --- xtuner/model/anyres_llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index ac11ea689..cf9aa344e 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -169,7 +169,7 @@ def __preprocess_for_pixel_values(self, data): # b*n, 27*27, d visual_outputs = self.visual_encoder( - concat_images, output_hidden_states=True) + concat_images, output_hidden_states=True).to(self.visual_encoder.dtype) if type(self.visual_encoder).__name__ == 'CLIPVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] elif type(self.visual_encoder).__name__ == 'SiglipVisionModel': From 622250c4b2312ac9a0189eecd3d68eb1c8847e8d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 11:56:19 +0800 Subject: [PATCH 027/126] fix --- xtuner/model/anyres_llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index cf9aa344e..0bac691d9 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -169,7 +169,7 @@ def __preprocess_for_pixel_values(self, data): # b*n, 27*27, d visual_outputs = self.visual_encoder( - concat_images, output_hidden_states=True).to(self.visual_encoder.dtype) + concat_images.to(self.visual_encoder.dtype), output_hidden_states=True) if type(self.visual_encoder).__name__ == 'CLIPVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] elif type(self.visual_encoder).__name__ == 'SiglipVisionModel': From c450da735c2c749fe62af9a7404357dafcc5de1d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 13:11:34 +0800 Subject: [PATCH 028/126] add pretrain --- ..._so400m_p14_anyres_e1_gpu8_all_finetune.py | 2 +- ...glip_so400m_p14_anyres_e1_gpu8_pretrain.py | 212 ++++++++++++++++++ xtuner/model/anyres_llava.py | 8 +- 3 files changed, 217 insertions(+), 5 deletions(-) create mode 100644 
xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py index 028aa0a54..3f0568ff7 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py @@ -99,7 +99,7 @@ llava_dataset = dict( type=AnyResLLaVADataset, image_grid_pinpoints=image_grid_pinpoints, - offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py new file mode 100644 index 000000000..2f4a08639 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the 
generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size'])) +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # 
every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 0bac691d9..3e34edd54 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -44,7 +44,7 @@ def __init__(self, llm, self.llm.config.use_cache = False dispatch_modules(self.llm) - if token_merge_ratio != 1: + if token_merge_ratio == -1: projector_config = ProjectorConfig( visual_hidden_size=self.visual_encoder.config.hidden_size, llm_hidden_size=self.llm.config.hidden_size, @@ -53,10 +53,10 @@ def __init__(self, llm, projector_config = ProjectorConfig( visual_hidden_size=self.visual_encoder.config.hidden_size * token_merge_ratio, llm_hidden_size=self.llm.config.hidden_size, - depth=projector_depth, - llm_hidden_size_ratio=1) + depth=projector_depth) self.projector = ProjectorModel(projector_config).to( self.visual_encoder.dtype) + print(self.projector,'=======================================================') if self.freeze_llm: self.llm.requires_grad_(False) @@ -184,7 +184,7 @@ def __preprocess_for_pixel_values(self, data): if pn == 27 * 27: # 直接减掉最后 1 个 token,减掉点,确保能被整除 visual_outputs = visual_outputs[:, :-1] - visual_outputs = visual_outputs.view(bs, (pn-1) // self.token_merge_ratio, int(hs * 4)) + visual_outputs = visual_outputs.reshape(bs, (pn-1) // self.token_merge_ratio, int(hs * 4)) # b*n, 182, d image_features = self.projector(visual_outputs) From 3043bb72a640816a859635ce145c116af57d7706 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 13:15:11 +0800 Subject: [PATCH 029/126] add pretrain --- .../llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py index 2f4a08639..5f98dd440 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py +++ 
b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py
@@ -96,6 +96,7 @@
 llava_dataset = dict(
     type=AnyResLLaVADataset,
     image_grid_pinpoints=image_grid_pinpoints,
+    offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain',
     data_path=data_path,
     image_folder=image_folder,
     tokenizer=tokenizer,

From 14f452838dae76f29763481678250ccbcac55822 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Tue, 9 Apr 2024 14:26:17 +0800
Subject: [PATCH 030/126] addcomment

---
 xtuner/dataset/utils.py      | 2 ++
 xtuner/model/anyres_llava.py | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py
index aded85f3e..b8a47d0d6 100644
--- a/xtuner/dataset/utils.py
+++ b/xtuner/dataset/utils.py
@@ -338,6 +338,7 @@ def resize_and_pad_image(image, target_resolution):
     # Resize the image
     resized_image = image.resize((new_width, new_height))

+    # TODO: should pad with the image mean, and the padding should be taken into account later
     new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
     paste_x = (target_width - new_width) // 2
     paste_y = (target_height - new_height) // 2
@@ -384,6 +385,7 @@ def divide_to_patches(image, patch_size):

     return patches

+    # resized directly here, so the padding does not need to be considered afterwards
     image_original_resize = image.resize((shortest_edge, shortest_edge))
     image_patches = [image_original_resize] + patches

diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py
index 3e34edd54..636831e28 100644
--- a/xtuner/model/anyres_llava.py
+++ b/xtuner/model/anyres_llava.py
@@ -56,7 +56,6 @@ def __init__(self, llm,
             depth=projector_depth)
         self.projector = ProjectorModel(projector_config).to(
             self.visual_encoder.dtype)
-        print(self.projector,'=======================================================')

         if self.freeze_llm:
             self.llm.requires_grad_(False)
@@ -193,7 +192,6 @@ def __preprocess_for_pixel_values(self, data):
         image_features = torch.split(image_features, split_sizes, dim=0)

         new_image_feature = []
-        # since token merge is applied, the unpad step is no longer needed
         if self.token_merge_ratio == 1:
             for image_idx, image_feature in enumerate(image_features):
                 if image_feature.shape[0] > 1:
@@ -236,6 +236,7 @@ def __preprocess_for_pixel_values(self, data):
                         (image_feature, self.image_newline[None]), dim=0)
                 new_image_feature.append(image_feature)
         else:
+            # with token merge the unpad step is hard to implement, so it is not supported for now
             new_image_feature = []
             for image_idx, image_feature in enumerate(image_features):
                 if image_feature.shape[0] > 1:

From 8bffc5a87153aaafe24bbefa081eb17504573f15 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Tue, 9 Apr 2024 20:10:51 +0800
Subject: [PATCH 031/126] fix path

---
 ...phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py
index 3f0568ff7..4e3ae7e4d 100644
--- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py
+++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py
@@ -27,7 +27,7 @@
 llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2'
 visual_encoder_name_or_path =
'/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' # Specify the pretrained pth -pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain/iter_2181.pth' # Data data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' @@ -99,7 +99,7 @@ llava_dataset = dict( type=AnyResLLaVADataset, image_grid_pinpoints=image_grid_pinpoints, - offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, From bf6e5e518242bc1d3972fa13b7969156a0be9f77 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 20:56:10 +0800 Subject: [PATCH 032/126] fix --- ...glip_so400m_p14_anyres_e1_gpu8_pretrain.py | 1 + .../anyres_llava_proxy_eval_dataset.py | 4 +++- xtuner/dataset/llava.py | 4 +++- xtuner/dataset/utils.py | 17 ++++++++++------- xtuner/model/anyres_llava.py | 19 ++++++++++++++++--- 5 files changed, 33 insertions(+), 12 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py index 5f98dd440..c693c7a2c 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py @@ -95,6 +95,7 @@ ####################################################################### llava_dataset = dict( type=AnyResLLaVADataset, + pad_image_to_square=True, # change this image_grid_pinpoints=image_grid_pinpoints, offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', data_path=data_path, diff --git a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py index 1718bcde9..3a76f3c2e 100644 --- a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py +++ b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py @@ -77,7 +77,9 @@ def getitem(self, idx, data): data_dict['orig_size'] = orig_size image = process_anyres_image(image, self.eval_ds.image_processor, self.image_grid_pinpoints, - self._patch_size, self._shortest_edge) + self._patch_size, self._shortest_edge, + pad_mean=tuple(int(x * 255) for x in self.eval_ds.image_processor.image_mean), + orig_img_pad_to_square=self.eval_ds.pad_image_to_square) data_dict['pixel_values'] = image return data_dict diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index ec087293f..9da9a2d15 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -131,7 +131,9 @@ def __getitem__(self, index): data_dict['orig_size'] = orig_size image = process_anyres_image(image, self.image_processor, self.image_grid_pinpoints, - self._patch_size, self._shortest_edge) + self._patch_size, self._shortest_edge, + pad_mean=tuple(int(x * 255) for x in self.image_processor.image_mean), + orig_img_pad_to_square=self.pad_image_to_square) data_dict['pixel_values'] = image else: data_dict['orig_size'] = self._crop_size diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index b8a47d0d6..c6376ebe5 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -75,7 +75,7 @@ def encode_fn(example, 
assert input_ids_with_output

     input_ids, labels = [], []
-    next_needs_bos_token = True
+    next_needs_bos_token = False
     for single_turn_conversation in example['conversation']:
         input = single_turn_conversation['input']
         if DEFAULT_IMAGE_TOKEN in input and with_image_token:
@@ -310,7 +310,7 @@ def select_best_resolution(original_size, possible_resolutions):
     return best_fit


-def resize_and_pad_image(image, target_resolution):
+def resize_and_pad_image(image, target_resolution,pad_mean):
     """Resize and pad an image to a target resolution while maintaining aspect
     ratio.

@@ -338,10 +338,10 @@ def resize_and_pad_image(image, target_resolution):
     # Resize the image
     resized_image = image.resize((new_width, new_height))

-    # TODO: should pad with the image mean, and the padding should be taken into account later
-    new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
+    new_image = Image.new('RGB', (target_width, target_height), pad_mean)
     paste_x = (target_width - new_width) // 2
     paste_y = (target_height - new_height) // 2
+    # center padding
     new_image.paste(resized_image, (paste_x, paste_y))

     return new_image
@@ -368,7 +368,7 @@ def divide_to_patches(image, patch_size):

     return patches


-def process_anyres_image(image, processor, possible_resolutions, patch_size, shortest_edge):
+def process_anyres_image(image, processor, possible_resolutions, patch_size, shortest_edge, pad_mean=(0, 0, 0), orig_img_pad_to_square=False):
     """Process an image with variable resolutions.

     Args:
@@ -381,11 +381,14 @@ def process_anyres_image(image, processor, possible_resolutions, patch_size, sho
         torch.Tensor: A tensor containing the processed image patches.
     """
     best_resolution = select_best_resolution(image.size, possible_resolutions)
-    image_padded = resize_and_pad_image(image, best_resolution)
+    image_padded = resize_and_pad_image(image, best_resolution, pad_mean)

     patches = divide_to_patches(image_padded, patch_size)

-    # resized directly here, so the padding does not need to be considered afterwards
+    if orig_img_pad_to_square:
+        # not center padding
+        image = expand2square(image, pad_mean)
+
     image_original_resize = image.resize((shortest_edge, shortest_edge))
     image_patches = [image_original_resize] + patches

diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py
index 636831e28..3e7113774 100644
--- a/xtuner/model/anyres_llava.py
+++ b/xtuner/model/anyres_llava.py
@@ -108,8 +108,8 @@ def __init__(self, llm,
         self.template = template

         self.token_merge_ratio = token_merge_ratio
-        # self.image_newline = torch.randn(
-        #     self.llm.config.hidden_size, dtype=self.visual_encoder.dtype)
+        self.image_newline = torch.randn(
+            self.llm.config.hidden_size, dtype=self.visual_encoder.dtype)
         self.image_grid_pinpoints = image_grid_pinpoints
         # self.mm_patch_merge_type = 'spatial_unpad'
         self.image_aspect_ratio = 'anyres'
@@ -244,11 +244,24 @@ def __preprocess_for_pixel_values(self, data):
                     base_image_feature = image_feature[0]
                     # n, 182, d
                     image_feature = image_feature[1:]
-                    # n*182,d
+
+                    # n,182+1, d
+                    image_feature = torch.cat(
+                        (image_feature,
+                         self.image_newline[None, None].expand(
+                             image_feature.shape[0], 1, image_feature.shape[-1])),
+                        dim=1)
+
+                    # n*183,d
                     image_feature = image_feature.flatten(0, 1)
                     image_feature = torch.cat((base_image_feature, image_feature),
                                               dim=0)
                     new_image_feature.append(image_feature)
                 else:
+                    # 182, d
                     image_feature = image_feature[0]
+                    # 183,d
+                    image_feature = torch.cat(
+                        (image_feature, self.image_newline[None]), dim=1)
+
                     new_image_feature.append(image_feature)
         return new_image_feature

From 8f7a2f7f96369e3d27f772364ef0523ec2944df6 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Tue, 9 Apr 2024
21:01:20 +0800 Subject: [PATCH 033/126] fix --- xtuner/model/anyres_llava.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 3e7113774..bea808fbc 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -12,6 +12,8 @@ make_inputs_require_grad, prepare_inputs_labels_for_multimodal) +import torch.nn as nn + class AnyResLLaVAModel(LLaVAModel): @@ -108,8 +110,9 @@ def __init__(self, llm, self.template = template self.token_merge_ratio = token_merge_ratio - self.image_newline = torch.randn( - self.llm.config.hidden_size, dtype=self.visual_encoder.dtype) + self.image_newline = nn.Parameter( + torch.randn( + self.llm.config.hidden_size, dtype=self.visual_encoder.dtype)) self.image_grid_pinpoints = image_grid_pinpoints # self.mm_patch_merge_type = 'spatial_unpad' self.image_aspect_ratio = 'anyres' From a151111fbaacbf850b9e9e5374df480282375ca5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 21:05:12 +0800 Subject: [PATCH 034/126] update --- xtuner/dataset/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index c6376ebe5..60dbce54e 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -75,7 +75,7 @@ def encode_fn(example, assert input_ids_with_output input_ids, labels = [], [] - next_needs_bos_token = False + next_needs_bos_token = True for single_turn_conversation in example['conversation']: input = single_turn_conversation['input'] if DEFAULT_IMAGE_TOKEN in input and with_image_token: From 6753fdef39de4dd59b87480105332679a2df8051 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 21:15:14 +0800 Subject: [PATCH 035/126] fix --- xtuner/model/anyres_llava.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index bea808fbc..83d2e9e85 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -245,6 +245,10 @@ def __preprocess_for_pixel_values(self, data): if image_feature.shape[0] > 1: # 182, d base_image_feature = image_feature[0] + # 183,d + base_image_feature = torch.cat( + (base_image_feature, self.image_newline[None]), dim=1) + # n, 182, d image_feature = image_feature[1:] From 6f5a66d508f92c2072b8b851d20ad54c5786d619 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 9 Apr 2024 21:18:14 +0800 Subject: [PATCH 036/126] fix --- xtuner/model/anyres_llava.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 83d2e9e85..3f32c002f 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -247,7 +247,7 @@ def __preprocess_for_pixel_values(self, data): base_image_feature = image_feature[0] # 183,d base_image_feature = torch.cat( - (base_image_feature, self.image_newline[None]), dim=1) + (base_image_feature, self.image_newline[None]), dim=0) # n, 182, d image_feature = image_feature[1:] @@ -268,7 +268,7 @@ def __preprocess_for_pixel_values(self, data): image_feature = image_feature[0] # 183,d image_feature = torch.cat( - (image_feature, self.image_newline[None]), dim=1) + (image_feature, self.image_newline[None]), dim=0) new_image_feature.append(image_feature) return new_image_feature From 5ec58f92e33873962d9bae499bbc83b705bda074 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 10 Apr 2024 16:46:15 +0800 Subject: [PATCH 037/126] fix bug --- xtuner/model/anyres_llava.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 3f32c002f..26985d2ed 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -270,5 +270,5 @@ def __preprocess_for_pixel_values(self, data): image_feature = torch.cat( (image_feature, self.image_newline[None]), dim=0) - new_image_feature.append(image_feature) + new_image_feature.append(image_feature) return new_image_feature From 02974d60dd26b596656fd734063c8211734cdf32 Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Thu, 11 Apr 2024 20:18:25 +0800 Subject: [PATCH 038/126] token_merge_ratio (#2) --- xtuner/model/llava.py | 51 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index c0719a744..e3619f8b3 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -30,6 +30,7 @@ def __init__(self, freeze_llm=False, freeze_visual_encoder=False, visual_select_layer=-2, + token_merge_ratio=1, pretrained_pth=None, projector_depth=2, llm_lora=None, @@ -52,8 +53,12 @@ def __init__(self, self.llm.config.use_cache = False dispatch_modules(self.llm) + assert int(token_merge_ratio**0.5)**2 == token_merge_ratio, \ + '`token_merge_ratio` must be a square number.' + self.token_merge_ratio = int(token_merge_ratio) + projector_config = ProjectorConfig( - visual_hidden_size=self.visual_encoder.config.hidden_size, + visual_hidden_size=self.visual_encoder.config.hidden_size * token_merge_ratio, llm_hidden_size=self.llm.config.hidden_size, depth=projector_depth) self.projector = ProjectorModel(projector_config).to( @@ -249,18 +254,50 @@ def _build_from_cfg_or_module(self, cfg_or_mod): else: raise NotImplementedError + @staticmethod + def _merge_tokens(tokens, token_merge_ratio): + if token_merge_ratio > 1: + # B, N, C + b, n, c = tokens.shape + h = w = int(n ** 0.5) + h_ratio = w_ratio = int(token_merge_ratio ** 0.5) + assert h * w == n + assert n % token_merge_ratio == 0, 'The number of visual tokens is not divisible by `token_merge_ratio`.' 
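+            # group each h_ratio x w_ratio window of the token grid into one token along the channel dimension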
+ # B, H, W, C + tokens = tokens.view(b, h, w, c) + # B, H, W // w_r, C * w_r + tokens = tokens.view(b, h, w // w_ratio, c * w_ratio) + # B, W // w_r, H, C * w_r + tokens = tokens.permute(0, 2, 1, 3).contiguous() + # B, W // w_r, H // h_r, C * w_r * h_r + tokens = tokens.view(b, w // w_ratio, h // h_ratio, + c * w_ratio * h_ratio) + # B, W * H // w_r // h_r, C * w_r * h_r + tokens = tokens.view(b, w * h // w_ratio // h_ratio, + c * w_ratio * h_ratio) + return tokens + + @staticmethod + def _get_model_class_name(model): + base_model = model + if model.__class__.__name__ == 'PeftModel': + base_model = model.base_model.model + else: + base_model = model + return base_model.__class__.__name__ + def _prepare_data_for_llm(self, data): if 'pixel_values' in data: visual_outputs = self.visual_encoder( data['pixel_values'].to(self.visual_encoder.dtype), output_hidden_states=True) - if type(self.visual_encoder).__name__ == 'CLIPVisionModel': - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] - elif type(self.visual_encoder).__name__ == 'SiglipVisionModel': - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] - else: - raise NotImplementedError + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + + if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': + visual_outputs = visual_outputs[:, 1:] + visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio) pixel_values = self.projector(visual_outputs) + data['pixel_values'] = pixel_values data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data) return data From fa2948aebbf3331e04e874775607c989f27b15d6 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 11 Apr 2024 20:23:39 +0800 Subject: [PATCH 039/126] add config --- ...yres_pixel_shuffle_e1_gpu8_all_finetune.py | 384 ++++++++++++++++++ xtuner/model/anyres_llava.py | 7 +- 2 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py new file mode 100644 index 000000000..cd4579397 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py @@ -0,0 +1,384 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
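+# Full fine-tune config: AnyRes LLaVA (phi-2 + SigLIP-so400m-patch14-384) with pixel-shuffle visual token merging (merge_type='pixel_shuffle', token_merge_ratio=4).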
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + merge_type='pixel_shuffle', # xxxxxxx + freeze_llm=False, + freeze_visual_encoder=False, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + 
pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + 
prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 26985d2ed..e6471eaef 100644 --- 
a/xtuner/model/anyres_llava.py
+++ b/xtuner/model/anyres_llava.py
@@ -32,10 +32,12 @@ def __init__(self, llm,
                  tokenizer=None,
                  template=None,
                  image_grid_pinpoints=None,
+                 merge_type='simple',  # or pixel_shuffle
                  token_merge_ratio=4):
         super(LLaVAModel, self).__init__()
         self.freeze_llm = freeze_llm
         self.freeze_visual_encoder = freeze_visual_encoder
+        self.merge_type = merge_type
         with LoadWoInit():
             if isinstance(llm, dict):
                 llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings)
@@ -186,7 +188,10 @@ def __preprocess_for_pixel_values(self, data):
             if pn == 27 * 27:
                 # simply drop the last token so the count becomes divisible
                 visual_outputs = visual_outputs[:, :-1]
-                visual_outputs = visual_outputs.reshape(bs, (pn-1) // self.token_merge_ratio, int(hs * 4))
+                if self.merge_type == 'simple':
+                    visual_outputs = visual_outputs.reshape(bs, (pn-1) // self.token_merge_ratio, int(hs * 4))
+                else:
+                    visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio)

         # b*n, 182, d
         image_features = self.projector(visual_outputs)

From 10e24fece1cf111e57592d34f8cb4798867488b1 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Thu, 11 Apr 2024 20:58:29 +0800
Subject: [PATCH 040/126] fix bug

---
 xtuner/model/anyres_llava.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py
index e6471eaef..9d3ed2a0d 100644
--- a/xtuner/model/anyres_llava.py
+++ b/xtuner/model/anyres_llava.py
@@ -186,12 +186,30 @@ def __preprocess_for_pixel_values(self, data):
         if self.token_merge_ratio != 1:
             # 27 is odd, so the token count is not evenly divisible and needs hard-coded handling
             if pn == 27 * 27:
-                # simply drop the last token so the count becomes divisible
-                visual_outputs = visual_outputs[:, :-1]
                 if self.merge_type == 'simple':
+                    # simply drop the last token so the count becomes divisible
+                    visual_outputs = visual_outputs[:, :-1]
                     visual_outputs = visual_outputs.reshape(bs, (pn - 1) // self.token_merge_ratio, int(hs * 4))
                 else:
-                    visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio)
+                    # the only option here is to pad with extra tokens
+                    h_ratio = w_ratio = int(self.token_merge_ratio ** 0.5)
+                    visual_outputs = visual_outputs.reshape(bs, 27, 27, -1)
+                    # pad to 28*28
+                    visual_outputs = torch.cat(
+                        (visual_outputs, torch.zeros(bs, 1, 27, hs, device=visual_outputs.device,dtype=visual_outputs.dtype)), dim=1)
+                    visual_outputs = torch.cat(
+                        (visual_outputs, torch.zeros(bs, 28, 1, hs, device=visual_outputs.device,dtype=visual_outputs.dtype)), dim=2)
+
+                    # B, H, W // w_r, C * w_r
+                    visual_outputs = visual_outputs.view(bs, 28, 28 // w_ratio, hs * w_ratio)
+                    # B, W // w_r, H, C * w_r
+                    visual_outputs = visual_outputs.permute(0, 2, 1, 3).contiguous()
+                    # B, W // w_r, H // h_r, C * w_r * h_r
+                    visual_outputs = visual_outputs.view(bs, 28 // w_ratio, 28 // h_ratio,
+                                                         hs * w_ratio * h_ratio)
+                    # B, W * H // w_r // h_r, C * w_r * h_r
+                    visual_outputs = visual_outputs.view(bs, 28 * 28 // w_ratio // h_ratio,
+                                                         hs * w_ratio * h_ratio).contiguous()

         # b*n, 182, d
         image_features = self.projector(visual_outputs)

From 7d59d822d40c7d0afedd408532529f0a337dfafe Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Fri, 12 Apr 2024 15:39:11 +0800
Subject: [PATCH 041/126] add mini-gemini

---
 ...o400m_p14_384_convnext_e1_gpu8_pretrain.py | 214 ++++++++++++++++++
 xtuner/dataset/__init__.py                    |   4 +-
 xtuner/dataset/mini_gemini_dataset.py         |  55 +++++
 xtuner/model/__init__.py                      |   3 +-
 xtuner/model/mini_gemini.py                   | 129 +++++++++++
 xtuner/model/modules/__init__.py              |   3 +-
 xtuner/model/modules/openclip_encoder.py      | 205
+++++++++++++++++ 7 files changed, 610 insertions(+), 3 deletions(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py create mode 100644 xtuner/dataset/mini_gemini_dataset.py create mode 100644 xtuner/model/mini_gemini.py create mode 100644 xtuner/model/modules/openclip_encoder.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py new file mode 100644 index 000000000..a740ddab2 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import MiniGeminiDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import MiniGeminiModel +from xtuner.model.modules import OpenCLIPVisionTower + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +visual_encoder_aux_name = 'model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup' +visual_encoder_aux_path = '/mnt/petrelfs/share_data/zhaoxiangyu/models--laion--CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/snapshots/39918dfbdf69ccd2172e6510a430e92337ee23e1/' + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = 
dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=MiniGeminiModel, + visual_encoder_aux=dict( + type=OpenCLIPVisionTower, + vision_tower=visual_encoder_aux_name, + vision_tower_path=visual_encoder_aux_path, + optimize_vision_tower_aux=False, + ), + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=MiniGeminiDataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + image_size_aux=864, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['pixel_values_aux'])) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. 
+ param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index 936abd606..a92e2f593 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -14,6 +14,7 @@ from .utils import decode_base64_to_image, expand2square, load_image from .llava_proxy_eval_dataset import LLaVAProxyEvalDataset from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset +from .mini_gemini_dataset import MiniGeminiDataset # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -36,5 +37,6 @@ 'InvRefCOCOJsonDataset', 'AnyResLLaVADataset', 'LLaVAProxyEvalDataset', - 'AnyResLLaVAProxyEvalDataset' + 'AnyResLLaVAProxyEvalDataset', + 'MiniGeminiDataset' ] diff --git a/xtuner/dataset/mini_gemini_dataset.py b/xtuner/dataset/mini_gemini_dataset.py new file mode 100644 index 000000000..48dac0824 --- /dev/null +++ b/xtuner/dataset/mini_gemini_dataset.py @@ -0,0 +1,55 @@ +from .llava import LLaVADataset +import torch +from PIL import Image +import os +from .utils import expand2square +import numpy as np + + +class MiniGeminiDataset(LLaVADataset): + # siglip 864 + # clip 768 + def __init__(self, *args, image_size_aux=864, **kwargs): + self.image_size_aux = image_size_aux + super().__init__(*args, **kwargs) + + self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) + self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) + + def __getitem__(self, index): + data_dict = self.text_data[index] + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(self.image_folder, + image_file)).convert('RGB') + image_aux = image + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + # aux image + if self.pad_image_to_square: + image_aux = expand2square( + image_aux, + tuple( + int(x * 255) for x in self.aux_mean)) + image_aux = image_aux.resize((self.image_size_aux, self.image_size_aux), resample=Image.BILINEAR) + image_aux = np.array(image_aux) # H, W, 3 + image_aux = image_aux / 255.0 + image_aux = (image_aux - self.aux_mean) / self.aux_std + image_aux = torch.tensor(image_aux).permute(2, 0, 1) + data_dict['pixel_values_aux'] = image_aux + else: + if hasattr(self.image_processor, 'crop_size'): + crop_size = self.image_processor.crop_size + else: + crop_size = self.image_processor.size + 
data_dict['pixel_values'] = torch.zeros(3, crop_size['height'], + crop_size['width']) + data_dict['pixel_values_aux'] = torch.zeros(3, self.image_size_aux, self.image_size_aux) + return data_dict diff --git a/xtuner/model/__init__.py b/xtuner/model/__init__.py index dd0f57eb3..e7d37e8c3 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -2,5 +2,6 @@ from .llava import LLaVAModel from .sft import SupervisedFinetune from .anyres_llava import AnyResLLaVAModel +from .mini_gemini import MiniGeminiModel -__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel'] diff --git a/xtuner/model/mini_gemini.py b/xtuner/model/mini_gemini.py new file mode 100644 index 000000000..87a711b7b --- /dev/null +++ b/xtuner/model/mini_gemini.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +import torch +import torch.nn as nn +from .utils import (get_peft_model_state_dict, guess_load_checkpoint, + prepare_inputs_labels_for_multimodal) +from .llava import LLaVAModel + + +class MiniGeminiModel(LLaVAModel): + def __init__(self, *args, visual_encoder_aux=None, pretrained_pth=None, **kwargs): + super().__init__(*args, pretrained_pth=None, **kwargs) + self.visual_encoder_aux = self._build_from_cfg_or_module(visual_encoder_aux) + + if self.freeze_visual_encoder: + self.visual_encoder_aux.requires_grad_(False) + + if self.use_activation_checkpointing: + self.visual_encoder_aux.activation_checkpointing_enable() + + mm_hidden_size = self.visual_encoder.config.hidden_size + mm_hidden_size_aux = self.visual_encoder_aux.hidden_size + self.vlm_uni_query_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size), + nn.Linear(mm_hidden_size, mm_hidden_size)) + self.vlm_uni_aux_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size_aux), + nn.Linear(mm_hidden_size_aux, + mm_hidden_size)) + self.vlm_uni_val_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size_aux), + nn.Linear(mm_hidden_size_aux, + mm_hidden_size)) + + if pretrained_pth is not None and not self.freeze_visual_encoder: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + + # to load convnext model + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'=======Load pretrained weight from {pretrained_pth}') + + def activation_checkpointing_disable(self): + super().activation_checkpointing_disable() + self.visual_encoder_aux.gradient_checkpointing_disable() + + def activation_checkpointing_enable(self): + super().activation_checkpointing_enable() + self.visual_encoder_aux.gradient_checkpointing_enable() + + def state_dict(self, *args, **kwargs): + state_dict = super().state_dict(*args, **kwargs) + to_return = OrderedDict() + # Step 1. visual_encoder + if self.use_visual_encoder_lora: + to_return.update( + get_peft_model_state_dict( + self.visual_encoder, state_dict=state_dict)) + elif not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder.' in k + }) + # Step 2. LLM + if self.use_llm_lora: + to_return.update( + get_peft_model_state_dict(self.llm, state_dict=state_dict)) + elif not self.freeze_llm: + to_return.update( + {k: v + for k, v in state_dict.items() if 'llm.' in k}) + # Step 3. Projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'projector.' in k}) + + # Step 4. 
visual_encoder_aux + if not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder_aux.' in k + }) + # Step 5. unified projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'vlm_uni_' in k}) + return to_return + + def _prepare_data_for_llm(self, data): + if 'pixel_values' in data: + visual_outputs = self.visual_encoder( + data['pixel_values'].to(self.visual_encoder.dtype), + output_hidden_states=True) + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + + if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': + visual_outputs = visual_outputs[:, 1:] + + visual_outputs_aux = torch.stack(data['pixel_values_aux']) + visual_outputs_aux = self.visual_encoder_aux( + visual_outputs_aux.to(self.visual_encoder_aux.dtype) + ) + visual_outputs = self.unified_resampler(visual_outputs, visual_outputs_aux) + + pixel_values = self.projector(visual_outputs) + data['pixel_values'] = pixel_values + data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data) + return data + + def unified_resampler(self, images, images_aux): + # patchwise with square images + patch_num = int(images.shape[1] ** 0.5) # 27 + # 216x216 + patch_size = images_aux.shape[-1] // patch_num # 8 + # within patch attention + images_aux = images_aux.permute(0, 2, 3, 1) + images_aux = images_aux.reshape(len(images_aux), patch_num, patch_size, patch_num, patch_size, + images_aux.shape[-1]) + images_aux = images_aux.permute(0, 1, 3, 2, 4, 5) + images_aux = images_aux.reshape(len(images_aux), patch_num ** 2, patch_size ** 2, + images_aux.shape[-1]).contiguous() + + # token attention + embed_query = self.vlm_uni_query_projector(images) + embed_aux = self.vlm_uni_aux_projector(images_aux) + embed_value = self.vlm_uni_val_projector(images_aux) + embed_att = embed_query[:, :, None] @ (embed_aux.transpose(-1, -2) / (embed_aux.shape[-1] ** 0.5)) + embed_att = embed_att.nan_to_num() + embed_feat = (embed_att.softmax(-1) @ embed_value).mean(2) + + image_features = images + embed_feat + return image_features diff --git a/xtuner/model/modules/__init__.py b/xtuner/model/modules/__init__.py index 1207a9249..ce8a3906f 100644 --- a/xtuner/model/modules/__init__.py +++ b/xtuner/model/modules/__init__.py @@ -1,4 +1,5 @@ from .dispatch import dispatch_modules from .projector import ProjectorConfig, ProjectorModel +from .openclip_encoder import OpenCLIPVisionTower -__all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel'] +__all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel', 'OpenCLIPVisionTower'] diff --git a/xtuner/model/modules/openclip_encoder.py b/xtuner/model/modules/openclip_encoder.py new file mode 100644 index 000000000..656929acf --- /dev/null +++ b/xtuner/model/modules/openclip_encoder.py @@ -0,0 +1,205 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import os +import json +import logging +from pathlib import Path +from typing import Dict, Optional +from transformers.deepspeed import is_deepspeed_zero3_enabled + + +try: + import deepspeed + from open_clip.factory import load_state_dict, get_model_config + from open_clip.model import CLIPVisionCfg, CLIPTextCfg, _build_vision_tower, convert_to_custom_text_state_dict, \ + resize_pos_embed +except ImportError: + pass + + +class OpenCLIPVisionTower(nn.Module): + def __init__(self, vision_tower, optimize_vision_tower_aux=False, delay_load=False): + super().__init__() + + self.is_loaded = False + self.vision_tower_name = vision_tower + 
self.vision_config = json.load(open(os.path.join(vision_tower, 'open_clip_config.json'), 'r')) + self.is_optimize = optimize_vision_tower_aux + + if not delay_load: + self.load_model() + + def load_model(self): + ckpt_path = os.path.join(self.vision_tower_name, 'open_clip_pytorch_model.bin') + if 'convnext' in self.vision_tower_name: + if 'large' in self.vision_tower_name and 'd-320' in self.vision_tower_name: + self.model_type = 'convnext_large_d_320' + self.model_channel = [192, 384, 768, 1536] # stage 0-3 + elif 'base' in self.vision_tower_name and 'w-320' in self.vision_tower_name: + self.model_type = 'convnext_base_w_320' + self.model_channel = [128, 256, 512, 1024] + elif 'xxlarge' in self.vision_tower_name: + self.model_type = 'convnext_xxlarge' + self.model_channel = [384, 768, 1536, 3072] + + clip_model = CLIP(**get_model_config(self.model_type)) + clip_model.visual.trunk.norm_pre = None + clip_model.visual.trunk.head = None + clip_model.visual.head = None + print(f'Loading pretrained weights ({self.model_type}).') + load_checkpoint(clip_model, ckpt_path, strict=False) + + self.clip_model = clip_model + self.is_loaded = True + # decompose stem and stages blocks in vision tower + # self.vision_stem = clip_model.visual.trunk.stem + # self.vision_stages = clip_model.visual.trunk.stages + + self.clip_model.vision_stem.requires_grad_(False) + + # self.vision_stages.requires_grad_(False) + + def gradient_checkpointing_enabled(self): + self.clip_model.set_grad_checkpointing(True) + + def gradient_checkpointing_disabled(self): + self.clip_model.set_grad_checkpointing(False) + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = self.backbone(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)) + image_features.append(image_feature) + else: + image_features = self.backbone(images.to(device=self.device, dtype=self.dtype)) + + return image_features + + def backbone(self, images): + if not self.is_optimize: + with torch.no_grad(): + results = self.basic_forward(images) + else: + results = self.basic_forward(images) + + target_size = (results['stage_0'].shape[-2], results['stage_0'].shape[-1]) + result_cat = [] + for _stage in results: + if _stage == 'stage_0': + result_cat.append(results[_stage].contiguous()) + else: + result_cat.append(F.interpolate(results[_stage].float().contiguous() , + size=target_size, + mode='bilinear', + align_corners=False).to(dtype=results[_stage].dtype)) + result_cat = torch.cat(result_cat, dim=1) + + return result_cat.contiguous() + + def basic_forward(self, images): + results = {} + x = self.clip_model.vision_stem(images) + for _idx in range(len(self.vision_stages)): + x = self.clip_model.vision_stages[_idx](x) + results[f'stage_{_idx}'] = x + return results + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_stem[0].weight.dtype + + @property + def device(self): + return self.vision_stem[0].weight.device + + @property + def config(self): + return self.vision_config + + @property + def hidden_size(self): + return sum(self.model_channel) + + +# modified function from open_clip to support zero3 stage +def load_checkpoint(model, checkpoint_path, strict=True): + if Path(checkpoint_path).suffix in ('.npz', '.npy'): + from open_clip.big_vision import load_big_vision_weights + load_big_vision_weights(model, checkpoint_path) + return {} + + state_dict = 
load_state_dict(checkpoint_path) + # detect old format and make compatible with new format + if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'): + state_dict = convert_to_custom_text_state_dict(state_dict) + # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712 + # if 'logit_bias' not in state_dict and model.logit_bias is not None: + # state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"]) + # Certain text transformers no longer expect position_ids after transformers==4.31 + position_id_key = 'text.transformer.embeddings.position_ids' + if position_id_key in state_dict and not hasattr(model, position_id_key): + del state_dict[position_id_key] + resize_pos_embed(state_dict, model) + # resize_text_pos_embed(state_dict, model) + #incompatible_keys = model.load_state_dict(state_dict, strict=strict) + if is_deepspeed_zero3_enabled(): + + error_msgs = [] + + def load(module: nn.Module, state_dict, prefix=""): + metadata = None + + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + # Parameters of module and children will start with prefix. We can exit early if there are none in this + # state_dict + if len([key for key in state_dict if key.startswith(prefix)]) > 0: + if is_deepspeed_zero3_enabled(): + # In sharded models, each shard has only part of the full state_dict, so only gather + # parameters that are in the current state_dict. + named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False)) + params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters] + if len(params_to_gather) > 0: + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, state_dict, prefix + name + ".") + + load(model, state_dict) + incompatible_keys = [] + else: + incompatible_keys = model.load_state_dict(state_dict, strict=strict) + logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}") + return incompatible_keys + + +class CLIP(nn.Module): + output_dict: torch.jit.Final[bool] + + def __init__( + self, + embed_dim: int, + vision_cfg: CLIPVisionCfg, + text_cfg: CLIPTextCfg, + quick_gelu: bool = False, + cast_dtype: Optional[torch.dtype] = None, + output_dict: bool = False, + ): + super().__init__() + self.output_dict = output_dict + + self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype) From 6afaa553f641f0cb747e467c48087270a94c34ca Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 17:39:36 +0800 Subject: [PATCH 042/126] fix bug --- ...clip_p14_384_convnext_e1_gpu8_pretrain.py} | 32 ++++---- xtuner/dataset/mini_gemini_dataset.py | 75 ++++++++++++------- xtuner/model/mini_gemini.py | 17 ++++- xtuner/model/modules/openclip_encoder.py | 6 +- 4 files changed, 82 insertions(+), 48 deletions(-) rename xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/{llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py => 
llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py} (90%) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py similarity index 90% rename from xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py rename to xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py index a740ddab2..c4136acf1 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py @@ -5,7 +5,7 @@ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR from torch.optim import AdamW from transformers import (AutoModelForCausalLM, AutoTokenizer, - SiglipImageProcessor, SiglipVisionModel) + CLIPImageProcessor, CLIPVisionModel) from xtuner.dataset import MiniGeminiDataset from xtuner.dataset.collate_fns import mm_collate_fn @@ -21,14 +21,14 @@ ####################################################################### # Model llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' -visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' # Data data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' image_folder = data_root + 'LLaVA-Pretrain/images' prompt_template = PROMPT_TEMPLATE.vicuna -max_length = int(2048 - (384 // 14) ** 2) +max_length = int(2048 - (336 // 14) ** 2) visual_encoder_aux_name = 'model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup' visual_encoder_aux_path = '/mnt/petrelfs/share_data/zhaoxiangyu/models--laion--CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/snapshots/39918dfbdf69ccd2172e6510a430e92337ee23e1/' @@ -47,7 +47,7 @@ # Save save_steps = 500 -save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training evaluation_freq = 500 @@ -65,7 +65,7 @@ padding_side='right') image_processor = dict( - type=SiglipImageProcessor.from_pretrained, + type=CLIPImageProcessor.from_pretrained, pretrained_model_name_or_path=visual_encoder_name_or_path, trust_remote_code=True) @@ -87,7 +87,7 @@ pretrained_model_name_or_path=llm_name_or_path, trust_remote_code=True), visual_encoder=dict( - type=SiglipVisionModel.from_pretrained, + type=CLIPVisionModel.from_pretrained, pretrained_model_name_or_path=visual_encoder_name_or_path)) ####################################################################### @@ -96,7 +96,7 @@ llava_dataset = dict( type=MiniGeminiDataset, offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', - image_size_aux=864, + image_size_aux=768, # siglip 864, clip 768 data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, @@ -155,15 +155,15 @@ # Log the dialogue periodically during the training process, optional custom_hooks = [ dict(type=DatasetInfoHook, tokenizer=tokenizer), - dict( - type=EvaluateChatHook, - 
tokenizer=tokenizer, - image_processor=image_processor, - every_n_iters=evaluation_freq, - evaluation_inputs=evaluation_inputs, - evaluation_images=evaluation_images, - system=SYSTEM, - prompt_template=prompt_template) + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) ] # configure default hooks diff --git a/xtuner/dataset/mini_gemini_dataset.py b/xtuner/dataset/mini_gemini_dataset.py index 48dac0824..b386f7b14 100644 --- a/xtuner/dataset/mini_gemini_dataset.py +++ b/xtuner/dataset/mini_gemini_dataset.py @@ -9,12 +9,20 @@ class MiniGeminiDataset(LLaVADataset): # siglip 864 # clip 768 - def __init__(self, *args, image_size_aux=864, **kwargs): + def __init__(self, *args, image_size_aux=768, **kwargs): self.image_size_aux = image_size_aux super().__init__(*args, **kwargs) - self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) - self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) + self._model_name = type(self.image_processor).__name__ + + if self._model_name == 'CLIPImageProcessor': + self.crop_size_raw = self.image_processor.size.copy() + self.image_processor.crop_size['height'] = image_size_aux + self.image_processor.crop_size['width'] = image_size_aux + self.image_processor.size['shortest_edge'] = image_size_aux + else: + self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) + self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) def __getitem__(self, index): data_dict = self.text_data[index] @@ -22,28 +30,45 @@ def __getitem__(self, index): image_file = data_dict['image'] image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') - image_aux = image - if self.pad_image_to_square: - image = expand2square( - image, - tuple( - int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - data_dict['pixel_values'] = image - - # aux image - if self.pad_image_to_square: - image_aux = expand2square( - image_aux, - tuple( - int(x * 255) for x in self.aux_mean)) - image_aux = image_aux.resize((self.image_size_aux, self.image_size_aux), resample=Image.BILINEAR) - image_aux = np.array(image_aux) # H, W, 3 - image_aux = image_aux / 255.0 - image_aux = (image_aux - self.aux_mean) / self.aux_std - image_aux = torch.tensor(image_aux).permute(2, 0, 1) - data_dict['pixel_values_aux'] = image_aux + + if self._model_name == 'CLIPImageProcessor': + # clip 和 convnext 均值和方差一样,前处理相同,但是 siglip 不一致 + if self.pad_image_to_square: + image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean)) + + image_aux = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values_aux'] = image_aux + + image = image_aux.clone() + image = torch.nn.functional.interpolate( + image[None], size=[self.crop_size_raw['height'], self.crop_size_raw['width']], mode='bilinear', + align_corners=False + )[0] + data_dict['pixel_values'] = image + else: + # siglip + image_aux = image + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + # aux image + if self.pad_image_to_square: + image_aux = expand2square( 
+ image_aux, + tuple( + int(x * 255) for x in self.aux_mean)) + image_aux = image_aux.resize((self.image_size_aux, self.image_size_aux), resample=Image.BILINEAR) + image_aux = np.array(image_aux) # H, W, 3 + image_aux = image_aux / 255.0 + image_aux = (image_aux - self.aux_mean) / self.aux_std + image_aux = torch.tensor(image_aux).permute(2, 0, 1) + data_dict['pixel_values_aux'] = image_aux else: if hasattr(self.image_processor, 'crop_size'): crop_size = self.image_processor.crop_size diff --git a/xtuner/model/mini_gemini.py b/xtuner/model/mini_gemini.py index 87a711b7b..d70de9035 100644 --- a/xtuner/model/mini_gemini.py +++ b/xtuner/model/mini_gemini.py @@ -39,11 +39,13 @@ def __init__(self, *args, visual_encoder_aux=None, pretrained_pth=None, **kwargs def activation_checkpointing_disable(self): super().activation_checkpointing_disable() - self.visual_encoder_aux.gradient_checkpointing_disable() + if hasattr(self, 'visual_encoder_aux'): + self.visual_encoder_aux.activation_checkpointing_disable() def activation_checkpointing_enable(self): super().activation_checkpointing_enable() - self.visual_encoder_aux.gradient_checkpointing_enable() + if hasattr(self, 'visual_encoder_aux'): + self.visual_encoder_aux.activation_checkpointing_enable() def state_dict(self, *args, **kwargs): state_dict = super().state_dict(*args, **kwargs) @@ -88,7 +90,7 @@ def _prepare_data_for_llm(self, data): visual_outputs = self.visual_encoder( data['pixel_values'].to(self.visual_encoder.dtype), output_hidden_states=True) - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer].bf16() if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': visual_outputs = visual_outputs[:, 1:] @@ -121,9 +123,16 @@ def unified_resampler(self, images, images_aux): embed_query = self.vlm_uni_query_projector(images) embed_aux = self.vlm_uni_aux_projector(images_aux) embed_value = self.vlm_uni_val_projector(images_aux) + # TODO siglip+convnext 在第一次 forward 后正常,但是 embed_att 会出现 nan + # TODO 导致第二次迭代时候 embed_value 会出现 nan,无法训练 + # TODO 怀疑是特征不匹配,即使全部转换为 fp32 也会出现 nan, 需要进一步排查 embed_att = embed_query[:, :, None] @ (embed_aux.transpose(-1, -2) / (embed_aux.shape[-1] ** 0.5)) + # print('=xxxx=', torch.any(torch.isnan(embed_query)).item(), + # torch.any(torch.isnan(embed_aux)).item(), + # torch.any(torch.isnan(embed_value)).item(), + # torch.any(torch.isnan(embed_att)).item()) embed_att = embed_att.nan_to_num() embed_feat = (embed_att.softmax(-1) @ embed_value).mean(2) - + # print('=xxcccxx=', torch.any(torch.isnan(embed_feat)).item()) image_features = images + embed_feat return image_features diff --git a/xtuner/model/modules/openclip_encoder.py b/xtuner/model/modules/openclip_encoder.py index 656929acf..cff91c396 100644 --- a/xtuner/model/modules/openclip_encoder.py +++ b/xtuner/model/modules/openclip_encoder.py @@ -60,10 +60,10 @@ def load_model(self): # self.vision_stages.requires_grad_(False) - def gradient_checkpointing_enabled(self): + def activation_checkpointing_enable(self): self.clip_model.set_grad_checkpointing(True) - def gradient_checkpointing_disabled(self): + def activation_checkpointing_disable(self): self.clip_model.set_grad_checkpointing(False) def forward(self, images): @@ -102,7 +102,7 @@ def basic_forward(self, images): results = {} x = self.clip_model.vision_stem(images) for _idx in range(len(self.vision_stages)): - x = self.clip_model.vision_stages[_idx](x) + x = self.clip_model.vision_stages[_idx](x) 
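Note on the unified_resampler hunk above: the Chinese TODO comments say that with the siglip+convnext pairing the first forward pass is fine but embed_att then turns NaN, so embed_value is NaN by the second iteration and training cannot proceed; the author suspects mismatched features, observes that casting everything to fp32 does not help, and flags it for further investigation, with nan_to_num() as the current stopgap. For reference, the resampler itself is per-token attention over the matching high-resolution patch group. Below is a minimal, self-contained sketch of the shape bookkeeping only; identity projections and random inputs stand in for the real vlm_uni_* projectors, so equal channel widths are assumed and this is not the repository's exact module.

import torch


def patchwise_resample(images, images_aux):
    # images:     (B, N, C)     low-resolution ViT tokens, N must be a perfect square
    # images_aux: (B, C, H, H)  fused high-resolution aux map, H divisible by sqrt(N)
    b, n, c = images.shape
    patch_num = int(n ** 0.5)                        # 24 for CLIP-L/14 @ 336, 27 for SigLIP @ 384
    patch_size = images_aux.shape[-1] // patch_num   # high-res cells per low-res token side
    # carve the aux map into one (patch_size**2, C) block per low-resolution token
    aux = images_aux.permute(0, 2, 3, 1)
    aux = aux.reshape(b, patch_num, patch_size, patch_num, patch_size, c)
    aux = aux.permute(0, 1, 3, 2, 4, 5).reshape(b, patch_num ** 2, patch_size ** 2, c)
    # each low-res token queries only its own block; the block is pooled by softmax attention
    att = images[:, :, None] @ (aux.transpose(-1, -2) / c ** 0.5)   # (B, N, 1, patch_size**2)
    feat = (att.softmax(-1) @ aux).mean(2)                          # (B, N, C)
    return images + feat


if __name__ == '__main__':
    low = torch.randn(2, 24 * 24, 1024)         # CLIP-L/14 @ 336 -> 576 tokens
    high = torch.randn(2, 1024, 192, 192)       # aux map assumed already projected to the same width
    print(patchwise_resample(low, high).shape)  # torch.Size([2, 576, 1024])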
results[f'stage_{_idx}'] = x return results From cef1cb7d95cc39d8d55f898644796bc345d8654b Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 17:42:43 +0800 Subject: [PATCH 043/126] fix bug --- xtuner/model/modules/openclip_encoder.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xtuner/model/modules/openclip_encoder.py b/xtuner/model/modules/openclip_encoder.py index cff91c396..185afe842 100644 --- a/xtuner/model/modules/openclip_encoder.py +++ b/xtuner/model/modules/openclip_encoder.py @@ -19,19 +19,22 @@ class OpenCLIPVisionTower(nn.Module): - def __init__(self, vision_tower, optimize_vision_tower_aux=False, delay_load=False): + def __init__(self, vision_tower, vision_tower_path, optimize_vision_tower_aux=False, delay_load=False): super().__init__() self.is_loaded = False self.vision_tower_name = vision_tower - self.vision_config = json.load(open(os.path.join(vision_tower, 'open_clip_config.json'), 'r')) + self.vision_tower_path = vision_tower_path + self.vision_config = json.load( + open(os.path.join(self.vision_tower_path, 'open_clip_config.json'), 'r') + ) self.is_optimize = optimize_vision_tower_aux if not delay_load: self.load_model() def load_model(self): - ckpt_path = os.path.join(self.vision_tower_name, 'open_clip_pytorch_model.bin') + ckpt_path = os.path.join(self.vision_tower_path, 'open_clip_pytorch_model.bin') if 'convnext' in self.vision_tower_name: if 'large' in self.vision_tower_name and 'd-320' in self.vision_tower_name: self.model_type = 'convnext_large_d_320' From 5c95d523c459337cf97e6ae66b57052f301757ae Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 17:45:45 +0800 Subject: [PATCH 044/126] fix bug --- xtuner/model/modules/openclip_encoder.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/xtuner/model/modules/openclip_encoder.py b/xtuner/model/modules/openclip_encoder.py index 185afe842..66ad89c91 100644 --- a/xtuner/model/modules/openclip_encoder.py +++ b/xtuner/model/modules/openclip_encoder.py @@ -59,15 +59,15 @@ def load_model(self): # self.vision_stem = clip_model.visual.trunk.stem # self.vision_stages = clip_model.visual.trunk.stages - self.clip_model.vision_stem.requires_grad_(False) + self.clip_model.visual.trunk.stem.requires_grad_(False) # self.vision_stages.requires_grad_(False) def activation_checkpointing_enable(self): - self.clip_model.set_grad_checkpointing(True) + self.clip_model.visual.set_grad_checkpointing(True) def activation_checkpointing_disable(self): - self.clip_model.set_grad_checkpointing(False) + self.clip_model.visual.set_grad_checkpointing(False) def forward(self, images): if type(images) is list: @@ -93,19 +93,19 @@ def backbone(self, images): if _stage == 'stage_0': result_cat.append(results[_stage].contiguous()) else: - result_cat.append(F.interpolate(results[_stage].float().contiguous() , - size=target_size, - mode='bilinear', + result_cat.append(F.interpolate(results[_stage].float().contiguous() , + size=target_size, + mode='bilinear', align_corners=False).to(dtype=results[_stage].dtype)) result_cat = torch.cat(result_cat, dim=1) return result_cat.contiguous() def basic_forward(self, images): - results = {} - x = self.clip_model.vision_stem(images) - for _idx in range(len(self.vision_stages)): - x = self.clip_model.vision_stages[_idx](x) + results = {} + x = self.clip_model.visual.trunk.stem(images) + for _idx in range(len(self.clip_model.visual.trunk.stages)): + x = self.clip_model.visual.trunk.stages[_idx](x) 
results[f'stage_{_idx}'] = x return results @@ -115,11 +115,11 @@ def dummy_feature(self): @property def dtype(self): - return self.vision_stem[0].weight.dtype + return self.clip_model.visual.trunk.stem[0].weight.dtype @property def device(self): - return self.vision_stem[0].weight.device + return self.clip_model.visual.trunk.stem[0].weight.device @property def config(self): From 736eba743c5482ab21fdf97b84468380d71fcac7 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 17:48:53 +0800 Subject: [PATCH 045/126] fix bug --- xtuner/dataset/mini_gemini_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/dataset/mini_gemini_dataset.py b/xtuner/dataset/mini_gemini_dataset.py index b386f7b14..56ac87456 100644 --- a/xtuner/dataset/mini_gemini_dataset.py +++ b/xtuner/dataset/mini_gemini_dataset.py @@ -16,7 +16,7 @@ def __init__(self, *args, image_size_aux=768, **kwargs): self._model_name = type(self.image_processor).__name__ if self._model_name == 'CLIPImageProcessor': - self.crop_size_raw = self.image_processor.size.copy() + self.crop_size_raw = self.image_processor.crop_size.copy() self.image_processor.crop_size['height'] = image_size_aux self.image_processor.crop_size['width'] = image_size_aux self.image_processor.size['shortest_edge'] = image_size_aux From b5ec2321fb6b155af9f58cde1f239642367fdad7 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 17:50:42 +0800 Subject: [PATCH 046/126] fix bug --- xtuner/model/mini_gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/model/mini_gemini.py b/xtuner/model/mini_gemini.py index d70de9035..23486ef68 100644 --- a/xtuner/model/mini_gemini.py +++ b/xtuner/model/mini_gemini.py @@ -90,7 +90,7 @@ def _prepare_data_for_llm(self, data): visual_outputs = self.visual_encoder( data['pixel_values'].to(self.visual_encoder.dtype), output_hidden_states=True) - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer].bf16() + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': visual_outputs = visual_outputs[:, 1:] From 2b4b3531330feb82c006b3a390ff585dc4453e4c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 18:16:20 +0800 Subject: [PATCH 047/126] add finetune --- ...o400m_p14_384_convnext_e1_gpu8_finetune.py | 376 ++++++++++++++++++ xtuner/dataset/__init__.py | 4 +- .../dataset/mini_gemini_proxy_eval_dataset.py | 95 +++++ xtuner/model/mini_gemini.py | 5 +- 4 files changed, 476 insertions(+), 4 deletions(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py create mode 100644 xtuner/dataset/mini_gemini_proxy_eval_dataset.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py new file mode 100644 index 000000000..a87a5770a --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py @@ -0,0 +1,376 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
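A note on the OpenCLIP encoder hunks above: backbone() fuses the four ConvNeXt stage outputs by bilinearly resizing stages 1-3 up to the stage-0 resolution (in fp32, then casting back) and concatenating along channels, which is why the tower's hidden_size property is sum(model_channel). The following standalone illustration uses random maps in place of the real trunk outputs and assumes the usual ConvNeXt stride pattern (stem /4, each later stage /2); it is a sketch of the fusion, not the patched class itself.

import torch
import torch.nn.functional as F


def fuse_convnext_stages(stage_outputs):
    # stage_outputs: list of (B, C_i, H_i, W_i) maps, highest resolution (stage 0) first
    target = stage_outputs[0].shape[-2:]
    fused = [stage_outputs[0].contiguous()]
    for feat in stage_outputs[1:]:
        # upsample in fp32 for stability, then cast back, as backbone() does
        fused.append(F.interpolate(feat.float(), size=target, mode='bilinear',
                                   align_corners=False).to(feat.dtype))
    return torch.cat(fused, dim=1)


if __name__ == '__main__':
    # convnext_large_d_320 channels from the hunk above; with a 768px input and a stride-4 stem
    # the per-stage spatial sizes would be 192, 96, 48, 24
    maps = [torch.randn(1, c, s, s) for c, s in zip((192, 384, 768, 1536), (192, 96, 48, 24))]
    print(fuse_convnext_stages(maps).shape)  # torch.Size([1, 2880, 192, 192]); 2880 = sum of channels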
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import MiniGeminiDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import MiniGeminiModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.model.modules import OpenCLIPVisionTower +from xtuner.dataset import MiniGeminiProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (336 // 14) ** 2) +image_size_aux = 768 + +visual_encoder_aux_name = 'model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup' +visual_encoder_aux_path = '/mnt/petrelfs/share_data/zhaoxiangyu/models--laion--CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/snapshots/39918dfbdf69ccd2172e6510a430e92337ee23e1/' + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=MiniGeminiModel, + visual_encoder_aux=dict( + type=OpenCLIPVisionTower, + vision_tower=visual_encoder_aux_name, + vision_tower_path=visual_encoder_aux_path, + optimize_vision_tower_aux=False, + ), + freeze_llm=False, + 
freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=MiniGeminiDataset, + image_size_aux=image_size_aux, # siglip 864, clip 768 + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['pixel_values_aux'])) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + 
pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id','pixel_values_aux'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'pixel_values_aux']) +) + +test_evaluator = val_evaluator 
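Every evaluation entry above forwards image_size_aux=768 next to the 336px CLIP processor, matching the "# siglip 864, clip 768" comment in the dataset config. The pairing is consistent with the resampler's patch arithmetic: the fused aux map has to tile evenly over the main encoder's token grid so that each low-resolution token owns an 8x8 block of high-resolution cells. A small sanity check of that arithmetic follows; the stride-4 stem is an assumption about the ConvNeXt trunk, not something this diff states.

import math


def aux_grid(vit_image_size, vit_patch, aux_image_size, stem_stride=4):
    tokens = (vit_image_size // vit_patch) ** 2   # main-encoder visual tokens
    patch_num = math.isqrt(tokens)                # side length of the token grid
    stage0 = aux_image_size // stem_stride        # spatial size of the fused aux map
    assert stage0 % patch_num == 0, 'aux map must tile evenly over the token grid'
    return patch_num, stage0 // patch_num


print(aux_grid(336, 14, 768))  # (24, 8)  CLIP-L/14-336 with aux at 768
print(aux_grid(384, 14, 864))  # (27, 8)  SigLIP-so400m/14-384 with aux at 864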
+test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index a92e2f593..05a571c3a 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -15,6 +15,7 @@ from .llava_proxy_eval_dataset import LLaVAProxyEvalDataset from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset from .mini_gemini_dataset import MiniGeminiDataset +from .mini_gemini_proxy_eval_dataset import MiniGeminiProxyEvalDataset # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -38,5 +39,6 @@ 'AnyResLLaVADataset', 'LLaVAProxyEvalDataset', 'AnyResLLaVAProxyEvalDataset', - 'MiniGeminiDataset' + 'MiniGeminiDataset', + 'MiniGeminiProxyEvalDataset' ] diff --git a/xtuner/dataset/mini_gemini_proxy_eval_dataset.py b/xtuner/dataset/mini_gemini_proxy_eval_dataset.py new file mode 100644 index 000000000..b7d93cd74 --- /dev/null +++ b/xtuner/dataset/mini_gemini_proxy_eval_dataset.py @@ -0,0 +1,95 @@ +from xtuner.dataset.utils import expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +import numpy as np + + +class MiniGeminiProxyEvalDataset: + def __init__(self, eval_dataset, image_size_aux=768): + self.eval_ds = eval_dataset + + self._model_name = type(eval_dataset.image_processor).__name__ + + if self._model_name == 'CLIPImageProcessor': + self.crop_size_raw = eval_dataset.image_processor.crop_size.copy() + self.eval_ds.image_processor.crop_size['height'] = image_size_aux + self.eval_ds.image_processor.crop_size['width'] = image_size_aux + self.eval_ds.image_processor.size['shortest_edge'] = image_size_aux + else: + self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) + self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = 
self.eval_ds.get_image(data['img']).convert('RGB') + + if self._model_name == 'CLIPImageProcessor': + # clip 和 convnext 均值和方差一样,前处理相同,但是 siglip 不一致 + if self.eval_ds.pad_image_to_square: + image = expand2square(image, tuple(int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + + image_aux = self.eval_ds.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values_aux'] = image_aux + + image = image_aux.clone() + image = torch.nn.functional.interpolate( + image[None], size=[self.crop_size_raw['height'], self.crop_size_raw['width']], mode='bilinear', + align_corners=False + )[0] + data_dict['pixel_values'] = image + else: + raise NotImplementedError + + return data_dict diff --git a/xtuner/model/mini_gemini.py b/xtuner/model/mini_gemini.py index 23486ef68..14cb466ff 100644 --- a/xtuner/model/mini_gemini.py +++ b/xtuner/model/mini_gemini.py @@ -30,12 +30,11 @@ def __init__(self, *args, visual_encoder_aux=None, pretrained_pth=None, **kwargs nn.Linear(mm_hidden_size_aux, mm_hidden_size)) - if pretrained_pth is not None and not self.freeze_visual_encoder: + if pretrained_pth is not None: pretrained_state_dict = guess_load_checkpoint(pretrained_pth) - # to load convnext model self.load_state_dict(pretrained_state_dict, strict=False) - print(f'=======Load pretrained weight from {pretrained_pth}') + print(f'Load pretrained weight from {pretrained_pth}') def activation_checkpointing_disable(self): super().activation_checkpointing_disable() From f706c463fc931836e724ff6d5dca20392a9467ca Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 20:44:43 +0800 Subject: [PATCH 048/126] =?UTF-8?q?add=20mmstar=20=E5=92=8C=20vqav2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...clip_p14_384_convnext_e1_gpu8_finetune.py} | 41 +++- ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 4 +- ..._siglip_so400m_p14_384_e1_gpu8_finetune.py | 4 +- ..._so400m_p14_anyres_e1_gpu8_all_finetune.py | 4 +- ...yres_pixel_shuffle_e1_gpu8_all_finetune.py | 4 +- xtuner/dataset/evaluation/__init__.py | 4 +- xtuner/dataset/evaluation/gqa_dataset.py | 10 +- xtuner/dataset/evaluation/vqav2_dataset.py | 139 +++++++++++ xtuner/dataset/evaluation/vqav2_utils.py | 216 ++++++++++++++++++ 9 files changed, 401 insertions(+), 25 deletions(-) rename xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/{llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py => llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py} (92%) create mode 100644 xtuner/dataset/evaluation/vqav2_dataset.py create mode 100644 xtuner/dataset/evaluation/vqav2_utils.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py similarity index 92% rename from xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py rename to xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py index a87a5770a..d75b200e3 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_convnext_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py @@ -14,7 +14,7 @@ from xtuner.model import MiniGeminiModel from xtuner.utils import PROMPT_TEMPLATE 
from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset, GQADataset + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -249,14 +249,6 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - dict( - type=MultipleChoiceDataset, - proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), dict( type=MultipleChoiceDataset, proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), @@ -341,13 +333,40 @@ dict( type=GQADataset, proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), - question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', - gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # 以下两个需要提交服务器进行在线评测 + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # vqav2 图片大概是 12w,推理要很久 + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True, + # ), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index a7d3dd074..2a6f04571 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -321,8 +321,8 @@ pad_image_to_square=True), dict( type=GQADataset, - question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', - 
gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py index dead563c9..94c3a02e4 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -321,8 +321,8 @@ pad_image_to_square=True), dict( type=GQADataset, - question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', - gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py index 4e3ae7e4d..ebc97f3ea 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py @@ -348,8 +348,8 @@ type=GQADataset, proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, image_grid_pinpoints=image_grid_pinpoints), - question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', - gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py index cd4579397..33bf6d9f5 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py @@ -349,8 +349,8 @@ type=GQADataset, proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, image_grid_pinpoints=image_grid_pinpoints), - 
question_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', - gt_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 76f5a333e..bca2e4609 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -4,5 +4,7 @@ from .hallusion_dataset import HallusionDataset from .textvqa_dataset import TextVQADataset from .gqa_dataset import GQADataset +from .vqav2_dataset import VQAv2Dataset -__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset'] +__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset', + 'VQAv2Dataset'] diff --git a/xtuner/dataset/evaluation/gqa_dataset.py b/xtuner/dataset/evaluation/gqa_dataset.py index ba38c64b6..fd16ec12b 100644 --- a/xtuner/dataset/evaluation/gqa_dataset.py +++ b/xtuner/dataset/evaluation/gqa_dataset.py @@ -15,8 +15,8 @@ class GQADataset(BaseEvalDataset): def __init__( self, - question_file, - gt_file, + data_file, + ann_file, image_folder, prompt_template, image_processor, @@ -28,8 +28,8 @@ def __init__( proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), ): super().__init__(metainfo) - self.data_file = question_file - self.gt_file = gt_file + self.data_file = data_file + self.ann_file = ann_file # Save detailed information for easy viewing self.answer_file = 'answer_gqa_results.jsonl' # solely for evaluation purposes @@ -110,7 +110,7 @@ def evaluate(self, results, work_dir): with open(prediction_file, 'w') as f: json.dump(all_preds, f) - evaluator = eval_gqa(questions=self.gt_file, predictions=prediction_file) + evaluator = eval_gqa(questions=self.ann_file, predictions=prediction_file) print_log('============================================', 'current') scores = evaluator.forward() print_log('============================================', 'current') diff --git a/xtuner/dataset/evaluation/vqav2_dataset.py b/xtuner/dataset/evaluation/vqav2_dataset.py new file mode 100644 index 000000000..e7161e75a --- /dev/null +++ b/xtuner/dataset/evaluation/vqav2_dataset.py @@ -0,0 +1,139 @@ +import os +import os.path as osp +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .vqav2_utils import EvalAIAnswerProcessor + + +class VQAv2Dataset(BaseEvalDataset): + + METAINFO: dict = dict(name='vqa_v2') + + def __init__( + self, + data_file, + test_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + self.data_file = data_file + self.test_file = test_file + self.image_folder = image_folder + # Save detailed information for easy viewing + self.answer_file = 'answer_vqav2_results.jsonl' + # 
solely for evaluation purposes + self.prediction_file = 'pred_vqav2_results.jsonl' + self.answer_processor = EvalAIAnswerProcessor() + + self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + self.template = prompt_template + self.pad_image_to_square = pad_image_to_square + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + question_data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + data_list = [] + for idx in range(len(question_data)): + sample = question_data[idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + category = sample['category'] + + data = { + 'img_id': idx, + 'index': index, + 'image_path': image_path, + 'question': question, + 'category': category, + } + data_list.append(data) + + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, results, work_dir): + answers_file = osp.join(work_dir, self.answer_file) + ans_file = open(answers_file, "w") + + for pred_dict in results: + idx = pred_dict["img_id"] + gt_data = self.data[idx] + + ans_file.write( + json.dumps( + { + "question_id": gt_data['index'], + "prompt": gt_data['question'], + "text": pred_dict['prediction'], + "metadata": {}, + } + ) + + "\n" + ) + ans_file.close() + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(answers_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(self.test_file)] + + all_answers = [] + + for x in test_split: + if x['question_id'] not in results: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': '' + }) + else: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': self.answer_processor(results[x['question_id']]) + }) + + prediction_file = osp.join(work_dir, self.prediction_file) + with open(prediction_file, 'w') as f: + json.dump(all_answers, f) + + print_log('============================================', 'current') + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + print_log(f'Please submit the generated {prediction_file} file to the official server for evaluation.', + 'current') + print_log('============================================', 'current') + return {'acc': 0} diff --git a/xtuner/dataset/evaluation/vqav2_utils.py b/xtuner/dataset/evaluation/vqav2_utils.py new file mode 100644 index 000000000..51566338b --- /dev/null +++ b/xtuner/dataset/evaluation/vqav2_utils.py @@ -0,0 +1,216 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
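VQAv2 is scored on the official server, so VQAv2Dataset.evaluate above only dumps the raw generations to answer_vqav2_results.jsonl and then rebuilds a submission list that covers every question_id in the test split, inserting an empty answer when a prediction is missing and normalizing the rest with the EvalAIAnswerProcessor defined in vqav2_utils.py below. A condensed sketch of that assembly with toy in-memory data in place of the real files and processor:

    import json

    results = {1: 'Two dogs!', 3: 'yes'}  # question_id -> generated text
    test_split = [{'question_id': 1}, {'question_id': 2}, {'question_id': 3}]

    def normalize(text):
        # Stand-in for EvalAIAnswerProcessor(), which lower-cases, strips punctuation,
        # maps number words to digits and expands missing-apostrophe contractions.
        return text.lower().strip()

    all_answers = [{
        'question_id': x['question_id'],
        'answer': normalize(results[x['question_id']]) if x['question_id'] in results else '',
    } for x in test_split]

    with open('pred_vqav2_results.jsonl', 'w') as f:
        json.dump(all_answers, f)  # a single JSON array despite the .jsonl name, as in evaluate()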
+import re + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + 
"{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or ( + re.search(self.COMMA_STRIP, in_text) is not None + ): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item From 91b718e522c4a61bc85da5a6a6fa9eb65d1ee53c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 20:49:37 +0800 Subject: [PATCH 049/126] update --- .../llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py index d75b200e3..474679eb8 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py @@ -342,6 +342,7 @@ pad_image_to_square=True), dict( type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', prompt_template=PROMPT_TEMPLATE.vicuna, tokenizer=tokenizer, @@ -359,6 +360,7 @@ # vqav2 图片大概是 12w,推理要很久 # dict( # type=VQAv2Dataset, + # proxy_eval_dataset = dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', From ca5e7fc1b9c8b491285368166ab018bad71b3970 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 12 Apr 2024 21:17:59 +0800 Subject: [PATCH 050/126] fix bug --- xtuner/dataset/mini_gemini_dataset.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xtuner/dataset/mini_gemini_dataset.py b/xtuner/dataset/mini_gemini_dataset.py index 56ac87456..33ad914ec 100644 --- a/xtuner/dataset/mini_gemini_dataset.py +++ b/xtuner/dataset/mini_gemini_dataset.py @@ -70,11 +70,8 @@ def __getitem__(self, index): image_aux = torch.tensor(image_aux).permute(2, 0, 1) data_dict['pixel_values_aux'] = image_aux else: - if hasattr(self.image_processor, 'crop_size'): - crop_size 
= self.image_processor.crop_size - else: - crop_size = self.image_processor.size - data_dict['pixel_values'] = torch.zeros(3, crop_size['height'], - crop_size['width']) data_dict['pixel_values_aux'] = torch.zeros(3, self.image_size_aux, self.image_size_aux) + if self._model_name == 'CLIPImageProcessor': + data_dict['pixel_values'] = torch.zeros(3, self.crop_size_raw['height'], + self.crop_size_raw['width']) return data_dict From 1131f37279a0bbb5e9fb58653a475c5b0d2e8613 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 18 Apr 2024 14:44:51 +0800 Subject: [PATCH 051/126] support s2+siglip --- ...glip_so400m_p14_384_s2_e1_gpu8_pretrain.py | 209 ++++++++++++++++++ xtuner/dataset/llava.py | 12 + xtuner/model/llava.py | 49 ++-- xtuner/model/utils.py | 85 +++++++ 4 files changed, 342 insertions(+), 13 deletions(-) create mode 100644 xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py new file mode 100644 index 000000000..07821b1bf --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py @@ -0,0 +1,209 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +import torch + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) +s2_scales = [1, 2] + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # 
+####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + s2_scales=s2_scales, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + s2_scales=s2_scales, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. 
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 9da9a2d15..648bb9e3d 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -28,6 +28,7 @@ def __init__(self, dataset_map_fn=None, template_map_fn=None, max_length=2048, + s2_scales=None, # [1, 2] or [1,2,3] pad_image_to_square=False): super().__init__() @@ -70,6 +71,17 @@ def __init__(self, self.image_processor = image_processor self.pad_image_to_square = pad_image_to_square + self.max_s2_scale = s2_scales + if s2_scales is not None: + self.max_s2_scale = max(s2_scales) + if hasattr(self.image_processor, 'crop_size'): + self.image_processor.crop_size['height'] *= self.max_s2_scale + self.image_processor.crop_size['width'] *= self.max_s2_scale + self.image_processor.size['shortest_edge'] *= self.max_s2_scale + else: + self.image_processor.size['height'] *= self.max_s2_scale + self.image_processor.size['width'] *= self.max_s2_scale + @property def modality_length(self): length_list = [] diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index e3619f8b3..1e4b0587b 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -15,12 +15,13 @@ from .utils import (LoadWoInit, find_all_linear_names, get_peft_model_state_dict, guess_load_checkpoint, make_inputs_require_grad, - prepare_inputs_labels_for_multimodal, traverse_dict) + prepare_inputs_labels_for_multimodal, traverse_dict, s2_forward) from xtuner.tools.utils import get_stop_criteria from xtuner.dataset.utils import expand2square, load_image from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, StopWordStoppingCriteria) - +from functools import reduce +from mmengine.logging import print_log class LLaVAModel(BaseModel): @@ -31,6 +32,7 @@ def __init__(self, freeze_visual_encoder=False, visual_select_layer=-2, token_merge_ratio=1, + s2_scales=None, # [1, 2] or [1,2,3] pretrained_pth=None, projector_depth=2, llm_lora=None, @@ -41,6 +43,7 @@ def __init__(self, tokenizer=None, template=None): super().__init__() + self.s2_scales = s2_scales self.freeze_llm = freeze_llm self.freeze_visual_encoder = freeze_visual_encoder with LoadWoInit(): @@ -57,8 +60,15 @@ def __init__(self, '`token_merge_ratio` must be a square number.' 
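Because s2_forward (added to xtuner/model/utils.py in the hunks below) resizes every scale back to the base feature grid and concatenates along the channel dimension, the number of visual tokens stays at the base count while the channel width grows with the number of scales; that is what the projector sizing in the next hunk accounts for. A rough sketch of the bookkeeping, assuming the SigLIP-SO400M/14 encoder at 384 px (27 x 27 patch grid, hidden size 1152) and s2_scales = [1, 2]:

    import torch
    import torch.nn.functional as F

    hidden_size, grid, s2_scales = 1152, 27, [1, 2]  # assumed SigLIP-SO400M/14 @ 384 numbers

    # One feature map per scale; the scale-2 map comes out of the chessboard split/merge
    # helpers below with twice the spatial size before being resized back down.
    feats = [torch.rand(1, hidden_size, grid * s, grid * s) for s in s2_scales]

    # S2-style fusion: every scale is resized to the base grid, then concatenated on channels.
    fused = torch.cat([F.interpolate(f, size=grid, mode='area') for f in feats], dim=1)
    tokens = fused.flatten(2).transpose(1, 2)  # b x 729 tokens x (hidden_size * num_scales)

    print(tokens.shape)  # torch.Size([1, 729, 2304])
    # The projector therefore needs an input width of hidden_size * len(s2_scales);
    # the product used in the hunk below happens to equal len(s2_scales) for [1, 2],
    # so it also gives 2304 here (the two would differ for e.g. [1, 2, 3]).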
self.token_merge_ratio = int(token_merge_ratio) + visual_hidden_size = self.visual_encoder.config.hidden_size * token_merge_ratio + self.s2_scales = s2_scales + if s2_scales is not None: + assert 1 in s2_scales, 'The scale of the original image must be included.' + total_scales = reduce(lambda x, y: x * y, s2_scales) + visual_hidden_size = visual_hidden_size * total_scales + projector_config = ProjectorConfig( - visual_hidden_size=self.visual_encoder.config.hidden_size * token_merge_ratio, + visual_hidden_size=visual_hidden_size, llm_hidden_size=self.llm.config.hidden_size, depth=projector_depth) self.projector = ProjectorModel(projector_config).to( @@ -112,8 +122,16 @@ def __init__(self, self.image_processor = image_processor if image_processor is not None: self.image_processor = BUILDER.build(image_processor) - self.template = template + if s2_scales is not None: + if hasattr(self.image_processor, 'crop_size'): + orig_img_size = self.image_processor.crop_size['height'] + else: + orig_img_size = self.image_processor.size['height'] + self.s2_img_sizes = [int(orig_img_size * scale) for scale in s2_scales] + + self.template = template + print_log(self, logger='current') def _parse_lora_config(self, lora_config): if isinstance(lora_config, dict) or isinstance( @@ -279,23 +297,28 @@ def _merge_tokens(tokens, token_merge_ratio): @staticmethod def _get_model_class_name(model): - base_model = model if model.__class__.__name__ == 'PeftModel': base_model = model.base_model.model else: base_model = model return base_model.__class__.__name__ + def __forward_feature(self, images): + visual_outputs = self.visual_encoder(images.to(self.visual_encoder.dtype), output_hidden_states=True) + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': + visual_outputs = visual_outputs[:, 1:] + return visual_outputs + def _prepare_data_for_llm(self, data): if 'pixel_values' in data: - visual_outputs = self.visual_encoder( - data['pixel_values'].to(self.visual_encoder.dtype), - output_hidden_states=True) - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] - - if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': - visual_outputs = visual_outputs[:, 1:] - visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio) + if self.s2_scales is None: + visual_outputs = self.__forward_feature(data['pixel_values']) + visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio) + else: + visual_outputs = s2_forward(self.__forward_feature, data['pixel_values'], + img_sizes=self.s2_img_sizes) + pixel_values = self.projector(visual_outputs) data['pixel_values'] = pixel_values diff --git a/xtuner/model/utils.py b/xtuner/model/utils.py index 0a35b5970..2553a369b 100644 --- a/xtuner/model/utils.py +++ b/xtuner/model/utils.py @@ -308,3 +308,88 @@ def guess_load_checkpoint(pth_model): else: raise FileNotFoundError(f'Cannot find {pth_model}') return state_dict + + + +# from https://github.com/bfshi/scaling_on_scales + +import math +import torch.nn.functional as F +from einops import rearrange + + +def split_chessboard(x, num_split): + """ + x: b * c * h * w + Deividing x into num_split**2 sub-squares, and concatenate all the sub-squares on the batch dimension + """ + B, C, H, W = x.shape + assert H % num_split == 0 and W % num_split == 0 + h, w = H // num_split, W // num_split + x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in 
range(num_split)], dim=0) + return x_split + + +def merge_chessboard(x, num_split): + """ + x: b * c * h * w + Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square. + (inverse of split_chessboard) + """ + B, C, H, W = x.shape + assert B % (num_split**2) == 0 + b = B // (num_split**2) + x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) + for i in range(num_split)], dim=-2) + return x_merge + + +def s2_forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0, + output_shape='bnc'): + + assert input.dim() == 4, "Input image must be in the shape of BxCxHxW." + assert input.shape[2] == input.shape[3], "Currently only square images are supported." + assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)." + assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token." + + b, c, input_size, _ = input.shape + + # image size for each scale + assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes." + img_sizes = img_sizes or [int(input_size * scale) for scale in scales] + + # prepare multiscale inputs + max_split_size = max_split_size or input_size # The maximum size of each split of image. Set as the input size by default + num_splits = [math.ceil(size / max_split_size) for size in img_sizes] # number of splits each scale + input_multiscale = [] + for size, num_split in zip(img_sizes, num_splits): + x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype) + x = split_chessboard(x, num_split=num_split) + input_multiscale.append(x) + + # run feedforward on each scale + outs_multiscale = [model(x) for x in input_multiscale] + if num_prefix_token > 0: + outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale] + outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] + if output_shape == 'bnc': + outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5)) + for out in outs_multiscale] + + # merge outputs of different splits for each scale separately + outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)] + + # interpolate outputs from different scales and concat together + output_size = outs_multiscale[resize_output_to_idx].shape[-2] + out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size, + mode='area').to(outs_multiscale[i].dtype) + for i in range(len(outs_multiscale))], dim=1) + if output_shape == 'bnc': + out = rearrange(out, 'b c h w -> b (h w) c') + if num_prefix_token > 0: + # take the mean of prefix tokens from different splits for each scale + outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale] + out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) + out = torch.cat([out_prefix_multiscale, out], dim=1) + + return out From b9cb7b9b1802865b0fd1f42cdcdb03228ae6f559 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 18 Apr 2024 15:09:58 +0800 Subject: [PATCH 052/126] fix --- xtuner/model/llava.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 1e4b0587b..67cf95f1f 100644 --- 
a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -128,6 +128,7 @@ def __init__(self, orig_img_size = self.image_processor.crop_size['height'] else: orig_img_size = self.image_processor.size['height'] + self.orig_img_size = orig_img_size self.s2_img_sizes = [int(orig_img_size * scale) for scale in s2_scales] self.template = template @@ -317,7 +318,8 @@ def _prepare_data_for_llm(self, data): visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio) else: visual_outputs = s2_forward(self.__forward_feature, data['pixel_values'], - img_sizes=self.s2_img_sizes) + img_sizes=self.s2_img_sizes, + max_split_size=self.orig_img_size) pixel_values = self.projector(visual_outputs) From c9a3ffce132eb41e561885a94f72cf46f63641d2 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 18 Apr 2024 20:12:55 +0800 Subject: [PATCH 053/126] update config --- ...lip_so400m_p14_384_e1_gpu8_all_finetune.py | 73 ++++++++++--------- ..._siglip_so400m_p14_384_e1_gpu8_pretrain.py | 15 ++-- ...glip_so400m_p14_384_s2_e1_gpu8_pretrain.py | 6 +- xtuner/dataset/collate_fns/mm_collate_fn.py | 11 ++- xtuner/dataset/utils.py | 6 +- xtuner/model/modules/__init__.py | 3 +- xtuner/utils/templates.py | 4 + 7 files changed, 64 insertions(+), 54 deletions(-) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py index 2a6f04571..ae00d642f 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -23,8 +23,8 @@ # PART 1 Settings # ####################################################################### # Model -llm_name_or_path = 'microsoft/phi-2' -visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' # Specify the pretrained pth pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' @@ -32,7 +32,7 @@ data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' image_folder = data_root + 'llava_images' -prompt_template = PROMPT_TEMPLATE.vicuna +prompt_template = PROMPT_TEMPLATE.plain max_length = int(2048 - (384 // 14) ** 2) # Scheduler & Optimizer @@ -48,14 +48,14 @@ warmup_ratio = 0.03 # Save -save_steps = 500 -save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training -evaluation_freq = 500 +evaluation_freq = 1000 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' -evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] +evaluation_inputs = ['Please describe this picture'] ####################################################################### # PART 2 Model & Tokenizer & Image Processor # @@ -216,69 +216,63 @@ # ==================== val and test cfg ======================= val_dataset = [ dict( - type=MMEDataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', - 
image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', - prompt_template=PROMPT_TEMPLATE.vicuna, + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - # dict( - # type=MultipleChoiceDataset, - # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - # prompt_template=PROMPT_TEMPLATE.vicuna, - # tokenizer=tokenizer, - # image_processor=image_processor, - # pad_image_to_square=True) ] test_dataset = [ dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), - dict( - type=MultipleChoiceDataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), @@ -287,7 +281,7 @@ data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), @@ -295,7 +289,7 @@ type=MMEDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', 
- prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, # for_llava_prompt=True, # 开了后,perception 会掉 @@ -303,7 +297,7 @@ dict( type=HallusionDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), @@ -315,7 +309,7 @@ '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' ], coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), @@ -324,7 +318,14 @@ data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', - prompt_template=PROMPT_TEMPLATE.vicuna, + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py index 0547c9a14..ed45a9b4f 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -19,11 +19,11 @@ # PART 1 Settings # ####################################################################### # Model -llm_name_or_path = 'microsoft/phi-2' -visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' # Data -data_root = './data/llava_data/' +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' image_folder = data_root + 'LLaVA-Pretrain/images' prompt_template = PROMPT_TEMPLATE.vicuna @@ -42,14 +42,14 @@ warmup_ratio = 0.03 # Save -save_steps = 500 -save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training -evaluation_freq = 500 +evaluation_freq = 1000 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' -evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] +evaluation_inputs = ['Please describe this picture'] ####################################################################### # PART 2 Model & Tokenizer & Image Processor # @@ -85,6 +85,7 @@ ####################################################################### llava_dataset = dict( type=LLaVADataset, + 
offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py index 07821b1bf..ed00b7c80 100644 --- a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py @@ -14,7 +14,6 @@ from xtuner.engine.runner import TrainLoop from xtuner.utils import PROMPT_TEMPLATE from xtuner.model import LLaVAModel -import torch ####################################################################### # PART 1 Settings # @@ -27,7 +26,7 @@ data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' image_folder = data_root + 'LLaVA-Pretrain/images' -prompt_template = PROMPT_TEMPLATE.vicuna +prompt_template = PROMPT_TEMPLATE.plain max_length = int(2048 - (384 // 14) ** 2) s2_scales = [1, 2] @@ -75,10 +74,9 @@ image_processor=image_processor, freeze_llm=True, freeze_visual_encoder=True, + # phi2 不能用 flash attention, Loss 下降趋势不正常, fp16 推理也有潜在风险 llm=dict( type=AutoModelForCausalLM.from_pretrained, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", pretrained_model_name_or_path=llm_name_or_path, trust_remote_code=True), visual_encoder=dict( diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py index 67d2a8f7b..47e29409c 100644 --- a/xtuner/dataset/collate_fns/mm_collate_fn.py +++ b/xtuner/dataset/collate_fns/mm_collate_fn.py @@ -33,6 +33,7 @@ def mm_collate_fn(instances: Sequence[Dict], if has_image: pixel_values.append(data['pixel_values']) + ori_length = [len(ids) for ids in input_ids] if len(instances) > 1: input_ids = pad_sequence( input_ids, batch_first=True, padding_value=pad_index) @@ -43,9 +44,15 @@ def mm_collate_fn(instances: Sequence[Dict], if mode == 'train': labels = torch.stack(labels) + # Some tokenizers have the same eos token and pad token, so input_ids + # cannot be masked directly based on the pad token id. 
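The collate change above keeps the pre-padding lengths in ori_length because, as the new comment says, some tokenizers reuse the eos token as the pad token, so the mask cannot be recovered from input_ids alone. A minimal, self-contained sketch of a per-sample mask built from those lengths, assuming right padding (row i is valid for its first ori_length[i] positions):

    import torch
    from torch.nn.utils.rnn import pad_sequence

    seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
    ori_length = [len(s) for s in seqs]  # recorded before padding
    input_ids = pad_sequence(seqs, batch_first=True, padding_value=0)

    attention_mask = torch.zeros_like(input_ids).bool()
    for row, length in enumerate(ori_length):
        attention_mask[row, :length] = True  # the mask is filled row by row

    print(attention_mask)
    # tensor([[ True,  True,  True],
    #         [ True,  True, False]])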
+ attention_mask = torch.zeros_like(input_ids).bool() + for i in ori_length: + attention_mask[:i] = True + if mode == 'train': - attention_mask = input_ids.ne(pad_index) - position_ids = attention_mask.long().cumsum(-1) - 1 + bs, seq_len = input_ids.shape + position_ids = torch.arange(seq_len).unsqueeze(0).long().repeat(bs, 1) if len(cumulative_len) == 0: cumulative_len = None diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index 60dbce54e..4fab3df0f 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -91,9 +91,9 @@ def encode_fn(example, input_encode.append(IMAGE_TOKEN_INDEX) else: input_encode = tokenizer.encode(input, add_special_tokens=False) - if next_needs_bos_token: - input_ids += bos_token_id - labels += [IGNORE_INDEX] * len(bos_token_id) + # if next_needs_bos_token: + # input_ids += bos_token_id + # labels += [IGNORE_INDEX] * len(bos_token_id) input_ids += input_encode labels += [IGNORE_INDEX] * len(input_encode) if input_ids_with_output: diff --git a/xtuner/model/modules/__init__.py b/xtuner/model/modules/__init__.py index ce8a3906f..1207a9249 100644 --- a/xtuner/model/modules/__init__.py +++ b/xtuner/model/modules/__init__.py @@ -1,5 +1,4 @@ from .dispatch import dispatch_modules from .projector import ProjectorConfig, ProjectorModel -from .openclip_encoder import OpenCLIPVisionTower -__all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel', 'OpenCLIPVisionTower'] +__all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel'] diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py index 1660a1ddd..32449c8e0 100644 --- a/xtuner/utils/templates.py +++ b/xtuner/utils/templates.py @@ -142,6 +142,10 @@ SUFFIX='<|END_OF_TURN_TOKEN|>', SUFFIX_AS_EOS=True, STOP_WORDS=['<|END_OF_TURN_TOKEN|>']), + plain=dict( + SYSTEM='', + INSTRUCTION='USER: {input} ASSISTANT:', + SEP=''), ) SYSTEM_TEMPLATE = ConfigDict( From bd013203afe161a70c4aeaf4e43e6befcce2890b Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 19 Apr 2024 10:53:45 +0800 Subject: [PATCH 054/126] add llama3 --- ...va_llama3_8b_chat_clip_e1_gpu8_pretrain.py | 203 ++++++++++++++++++ xtuner/dataset/map_fns/template_map_fn.py | 20 +- xtuner/utils/templates.py | 7 + 3 files changed, 223 insertions(+), 7 deletions(-) create mode 100644 xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py new file mode 100644 index 000000000..d9a4d34da --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = 'You are a pirate chatbot who always responds in pirate speak!' 
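With the CLIP ViT-L/14 encoder at 336 px assumed by this config, the image placeholder expands to (336 // 14) ** 2 = 576 visual embeddings, which is exactly what max_length subtracts from the 2048-token context; a quick check:

    patches_per_side = 336 // 14   # 24 for CLIP ViT-L/14 at 336 px
    image_tokens = patches_per_side ** 2
    max_length = 2048 - image_tokens
    print(image_tokens, max_length)  # 576 1472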
+evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template, global_system=SYSTEM), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. 
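The dataset above passes global_system to template_map_fn_factory; per the template_map_fn change in a later hunk of this commit, that system string is rendered on the first turn only, using the llama3_chat strings added to templates.py further below. A sketch of the resulting first-turn prefix, with the template literals copied from that hunk and the image placeholder as an assumed input:

    SYSTEM_TMPL = ('<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
                   '{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n')
    INSTRUCTION = '{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'

    global_system = 'You are a pirate chatbot who always responds in pirate speak!'
    first_turn = SYSTEM_TMPL.format(system=global_system) + INSTRUCTION.format(
        input='<image>\nPlease describe this picture', round=1)
    print(first_turn)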
+ timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/map_fns/template_map_fn.py b/xtuner/dataset/map_fns/template_map_fn.py index d7673b99e..821176544 100644 --- a/xtuner/dataset/map_fns/template_map_fn.py +++ b/xtuner/dataset/map_fns/template_map_fn.py @@ -4,17 +4,23 @@ from mmengine.utils.misc import get_object_from_string -def template_map_fn(example, template): +def template_map_fn(example, template, global_system=None): conversation = example.get('conversation', []) for i, single_turn_conversation in enumerate(conversation): input = single_turn_conversation.get('input', '') if input is None: input = '' input_text = template.INSTRUCTION.format(input=input, round=i + 1) - system = single_turn_conversation.get('system', '') - if system != '' and system is not None: - system = template.SYSTEM.format(system=system) - input_text = system + input_text + if global_system is not None: + if i == 0: + # only add system to the first turn + system = template.SYSTEM.format(system=global_system) + input_text = system + input_text + else: + system = single_turn_conversation.get('system', '') + if system != '' and system is not None: + system = template.SYSTEM.format(system=system) + input_text = system + input_text single_turn_conversation['input'] = input_text if template.get('SUFFIX', None): @@ -30,7 +36,7 @@ def template_map_fn(example, template): return {'conversation': conversation} -def template_map_fn_factory(template): +def template_map_fn_factory(template, global_system=None): if isinstance(template, str): # for resume template = get_object_from_string(template) - return partial(template_map_fn, template=template) + return partial(template_map_fn, template=template, global_system=global_system) diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py index 32449c8e0..8f7460aeb 100644 --- a/xtuner/utils/templates.py +++ b/xtuner/utils/templates.py @@ -146,6 +146,13 @@ SYSTEM='', INSTRUCTION='USER: {input} ASSISTANT:', SEP=''), + llama3_chat=dict( + SYSTEM=( + '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n' + '{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'), + INSTRUCTION='{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>', + SEP='\n\n', + STOP_WORDS=['<|eot_id|>']), ) SYSTEM_TEMPLATE = ConfigDict( From d3021341f7eb54c2d9c21f59f018fd9a530508a9 Mon Sep 17 00:00:00 2001 From: huanghaian Date: 
Fri, 19 Apr 2024 11:25:36 +0800 Subject: [PATCH 055/126] fix temp --- ...va_llama3_8b_chat_clip_e1_gpu8_pretrain.py | 4 ++-- xtuner/dataset/map_fns/template_map_fn.py | 20 +++++++------------ xtuner/dataset/utils.py | 6 +++--- xtuner/utils/templates.py | 6 +++--- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py index d9a4d34da..db96bb827 100644 --- a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py @@ -47,7 +47,7 @@ # Evaluate the generation performance during the training evaluation_freq = 1000 -SYSTEM = 'You are a pirate chatbot who always responds in pirate speak!' +SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] @@ -92,7 +92,7 @@ image_processor=image_processor, dataset_map_fn=llava_map_fn, template_map_fn=dict( - type=template_map_fn_factory, template=prompt_template, global_system=SYSTEM), + type=template_map_fn_factory, template=prompt_template), max_length=max_length, pad_image_to_square=False) diff --git a/xtuner/dataset/map_fns/template_map_fn.py b/xtuner/dataset/map_fns/template_map_fn.py index 821176544..d7673b99e 100644 --- a/xtuner/dataset/map_fns/template_map_fn.py +++ b/xtuner/dataset/map_fns/template_map_fn.py @@ -4,23 +4,17 @@ from mmengine.utils.misc import get_object_from_string -def template_map_fn(example, template, global_system=None): +def template_map_fn(example, template): conversation = example.get('conversation', []) for i, single_turn_conversation in enumerate(conversation): input = single_turn_conversation.get('input', '') if input is None: input = '' input_text = template.INSTRUCTION.format(input=input, round=i + 1) - if global_system is not None: - if i == 0: - # only add system to the first turn - system = template.SYSTEM.format(system=global_system) - input_text = system + input_text - else: - system = single_turn_conversation.get('system', '') - if system != '' and system is not None: - system = template.SYSTEM.format(system=system) - input_text = system + input_text + system = single_turn_conversation.get('system', '') + if system != '' and system is not None: + system = template.SYSTEM.format(system=system) + input_text = system + input_text single_turn_conversation['input'] = input_text if template.get('SUFFIX', None): @@ -36,7 +30,7 @@ def template_map_fn(example, template, global_system=None): return {'conversation': conversation} -def template_map_fn_factory(template, global_system=None): +def template_map_fn_factory(template): if isinstance(template, str): # for resume template = get_object_from_string(template) - return partial(template_map_fn, template=template, global_system=global_system) + return partial(template_map_fn, template=template) diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index 4fab3df0f..60dbce54e 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -91,9 +91,9 @@ def encode_fn(example, input_encode.append(IMAGE_TOKEN_INDEX) else: input_encode = tokenizer.encode(input, add_special_tokens=False) - # if next_needs_bos_token: - # input_ids += bos_token_id - # labels += [IGNORE_INDEX] * len(bos_token_id) + if next_needs_bos_token: + input_ids += bos_token_id + labels += [IGNORE_INDEX] * len(bos_token_id) input_ids += 
input_encode labels += [IGNORE_INDEX] * len(input_encode) if input_ids_with_output: diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py index 8f7460aeb..f8e37ee88 100644 --- a/xtuner/utils/templates.py +++ b/xtuner/utils/templates.py @@ -148,9 +148,9 @@ SEP=''), llama3_chat=dict( SYSTEM=( - '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n' - '{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'), - INSTRUCTION='{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>', + '<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>'), + INSTRUCTION='<|start_header_id|>user<|end_header_id|>\n\n' + '{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>', SEP='\n\n', STOP_WORDS=['<|eot_id|>']), ) From b71b2374593fc59afc7f38b1a8a0a627ac154517 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 19 Apr 2024 12:04:32 +0800 Subject: [PATCH 056/126] update temp --- xtuner/utils/templates.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py index f8e37ee88..cf530329a 100644 --- a/xtuner/utils/templates.py +++ b/xtuner/utils/templates.py @@ -147,11 +147,13 @@ INSTRUCTION='USER: {input} ASSISTANT:', SEP=''), llama3_chat=dict( - SYSTEM=( - '<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>'), - INSTRUCTION='<|start_header_id|>user<|end_header_id|>\n\n' - '{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>', - SEP='\n\n', + SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n' + '{system}<|eot_id|>'), + INSTRUCTION=( + '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>' + '<|start_header_id|>assistant<|end_header_id|>\n\n'), + SUFFIX='<|eot_id|>', + SUFFIX_AS_EOS=True, STOP_WORDS=['<|eot_id|>']), ) From 9e2c2b23ea69cc5c1c69576df00031a84bddf7d3 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 19 Apr 2024 13:33:39 +0800 Subject: [PATCH 057/126] add finetune config --- ...ama3_8b_chat_clip_lora_e1_gpu8_finetune.py | 357 ++++++++++++++++++ 1 file changed, 357 insertions(+) create mode 100644 xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py new file mode 100644 index 000000000..098e33760 --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py @@ -0,0 +1,357 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
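For readability, a rough sketch of how a single-turn prompt renders under the llama3_chat template settled on in the patches above; the system and user strings are made-up placeholders and the wiring through template_map_fn is simplified:

SYSTEM = '<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>'
INSTRUCTION = ('<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
               '<|start_header_id|>assistant<|end_header_id|>\n\n')
SUFFIX = '<|eot_id|>'  # also the stop word; treated as EOS via SUFFIX_AS_EOS

# The system block is only prepended when the conversation carries a system
# message; the assistant's reply is appended after the trailing assistant
# header and terminated with SUFFIX.
prompt = (SYSTEM.format(system='You are a helpful assistant.')
          + INSTRUCTION.format(input='Please describe this picture'))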
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # 
+####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + 
tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From a5ac45b49126aae97fe9bfe1ee857493155cc738 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 19 Apr 2024 16:02:02 +0800 Subject: [PATCH 058/126] add internvl config --- ...b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py | 203 +++++++ ...hat_clip_lora_e1_gpu8_internvl_finetune.py | 494 ++++++++++++++++++ 2 files changed, 697 insertions(+) create mode 100644 xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py create mode 100644 xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py new file mode 100644 index 000000000..7a8b32ea8 --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +data_path = data_root + 'share-captioner_coco_lcs_sam_1246k_1107.json' +image_folder = data_root + 'data' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(4096 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + 
pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py new file mode 100644 index 000000000..bff5b1c4b --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py @@ -0,0 +1,494 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
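As a quick sanity check on the ShareGPT4V pretrain schedule above, the effective global batch size it implies (assuming the eight GPUs suggested by the "gpu8" file name) works out as:

per_device_batch = 16
accumulative_counts = 2
num_gpus = 8
global_batch = per_device_batch * accumulative_counts * num_gpus  # 256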
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain/iter_xxx.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 5000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 5000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + 
pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +sharegpt4v_caption_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + 
type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +geoqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + 
pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 9458ad279333d670de4e6786aa30af9334a81aeb Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 23 Apr 2024 12:00:13 +0800 Subject: [PATCH 059/126] updata --- ...b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py | 2 +- ...ama3_8b_chat_clip_lora_e1_gpu8_finetune.py | 52 +++++++++++--- ...hat_clip_lora_e1_gpu8_internvl_finetune.py | 72 ++++++++++++++----- xtuner/dataset/llava_proxy_eval_dataset.py | 8 ++- xtuner/model/llava.py | 3 +- 5 files changed, 104 insertions(+), 33 deletions(-) diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py index 7a8b32ea8..a6ddcc2f8 100644 --- a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py @@ -85,7 +85,7 @@ 
####################################################################### llava_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/pretrain', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py index 098e33760..d1a564bc2 100644 --- a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py @@ -35,8 +35,8 @@ max_length = int(2048 - (336 // 14) ** 2) # Scheduler & Optimizer -batch_size = 16 # per_device -accumulative_counts = 1 +batch_size = 8 # per_device +accumulative_counts = 2 dataloader_num_workers = 4 max_epochs = 1 optim_type = AdamW @@ -80,6 +80,8 @@ llm=dict( type=AutoModelForCausalLM.from_pretrained, pretrained_model_name_or_path=llm_name_or_path, + # to speed inference + # attn_implementation='sdpa', trust_remote_code=True), visual_encoder=dict( type=CLIPVisionModel.from_pretrained, @@ -93,7 +95,7 @@ ####################################################################### llava_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/llama3_8b_finetune', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, @@ -234,13 +236,6 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - # dict( - # type=MultipleChoiceDataset, - # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', - # prompt_template=prompt_template, - # tokenizer=tokenizer, - # image_processor=image_processor, - # pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', @@ -329,6 +324,43 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), ] # 
TODO: We are not currently using val_evaluator diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py index bff5b1c4b..29c04623f 100644 --- a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py @@ -16,7 +16,7 @@ from peft import LoraConfig from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset, GQADataset + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop ####################################################################### @@ -25,7 +25,7 @@ # Model llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' -pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain/iter_xxx.pth' +pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain/iter_9742.pth' # Data data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' @@ -106,6 +106,7 @@ llm=dict( type=AutoModelForCausalLM.from_pretrained, pretrained_model_name_or_path=llm_name_or_path, + # attn_implementation='sdpa', trust_remote_code=True), visual_encoder=dict( type=CLIPVisionModel.from_pretrained, @@ -117,9 +118,12 @@ ####################################################################### # PART 3 Dataset & Dataloader # ####################################################################### + +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/' + sharegpt4v_caption_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', data_path=sharegpt4v_caption_data_path, image_folder=sharegpt4v_caption_image_folder, tokenizer=tokenizer, @@ -132,7 +136,7 @@ llava_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'llava_dataset', data_path=llava_data_path, image_folder=llava_image_folder, tokenizer=tokenizer, @@ -145,7 +149,7 @@ sharegpt4v_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', data_path=sharegpt4v_data_path, image_folder=sharegpt4v_image_folder, tokenizer=tokenizer, @@ -159,7 +163,7 @@ dvqa_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'dvqa_dataset', data_path=dvqa_data_path, image_folder=dvqa_image_folder, tokenizer=tokenizer, @@ -172,7 +176,7 @@ chartqa_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'chartqa_dataset', data_path=chartqa_data_path, image_folder=chartqa_image_folder, tokenizer=tokenizer, @@ -185,7 +189,7 @@ ai2d_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'ai2d_dataset', data_path=ai2d_data_path, image_folder=ai2d_image_folder, tokenizer=tokenizer, @@ -198,7 +202,7 @@ docvqa_dataset = dict( type=LLaVADataset, - 
offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'docvqa_dataset', data_path=docvqa_data_path, image_folder=docvqa_image_folder, tokenizer=tokenizer, @@ -211,7 +215,7 @@ geoqa_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'geoqa_dataset', data_path=geoqa_data_path, image_folder=geoqa_image_folder, tokenizer=tokenizer, @@ -224,7 +228,7 @@ synthdog_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_root+'synthdog_dataset', data_path=synthdog_data_path, image_folder=synthdog_image_folder, tokenizer=tokenizer, @@ -245,7 +249,7 @@ batch_size=batch_size, num_workers=dataloader_num_workers, pin_memory=True, - dataset=llava_dataset, + dataset=train_dataset, sampler=dict( type=LengthGroupedSampler, length_property='modality_length', @@ -371,13 +375,6 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - # dict( - # type=MultipleChoiceDataset, - # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', - # prompt_template=prompt_template, - # tokenizer=tokenizer, - # image_processor=image_processor, - # pad_image_to_square=True), dict( type=MultipleChoiceDataset, data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', @@ -466,6 +463,43 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index e918c1045..73e85af44 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -42,7 +42,10 @@ def getitem(self, idx, data): chunk_encode = [] for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): if idx == 0: - cur_encode = self.eval_ds.tokenizer.encode(chunk) + # add bos token + bos_token_id = self.eval_ds.tokenizer.bos_token_id + cur_encode = [bos_token_id] + cur_encode += self.eval_ds.tokenizer.encode(chunk) else: cur_encode = 
self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) chunk_encode.append(cur_encode) @@ -56,7 +59,7 @@ def getitem(self, idx, data): data_dict['input_ids'] = ids # 3 process image - if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2']: # MMEDataset or TextVQADataset image = Image.open(os.path.join(self.eval_ds.image_folder, data['image_path'])).convert('RGB') @@ -73,3 +76,4 @@ def getitem(self, idx, data): data_dict['pixel_values'] = image return data_dict + diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 67cf95f1f..60793ebe3 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -258,7 +258,8 @@ def _dispatch_lm_model_cfg(self, cfg, max_position_embeddings=None): pretrained_model_name_or_path = cfg.pretrained_model_name_or_path llm_cfg = AutoConfig.from_pretrained( pretrained_model_name_or_path, trust_remote_code=True) - cfg, llm_cfg = self._prepare_for_flash_attn(cfg, llm_cfg) + if not hasattr(cfg, 'attn_implementation'): + cfg, llm_cfg = self._prepare_for_flash_attn(cfg, llm_cfg) if max_position_embeddings is not None: cfg, llm_cfg = self._prepare_for_long_context_training( cfg, llm_cfg, max_position_embeddings) From e79b7aa3c24a58515c85e8a5839000abe54006b2 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 23 Apr 2024 16:38:40 +0800 Subject: [PATCH 060/126] add chartqa --- ...ama3_8b_chat_clip_lora_e1_gpu8_finetune.py | 12 +- xtuner/dataset/evaluation/__init__.py | 3 +- xtuner/dataset/evaluation/chartqa_dataset.py | 173 ++++++++++++++++++ xtuner/dataset/llava_proxy_eval_dataset.py | 5 +- 4 files changed, 190 insertions(+), 3 deletions(-) create mode 100644 xtuner/dataset/evaluation/chartqa_dataset.py diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py index d1a564bc2..84e68e658 100644 --- a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py @@ -16,7 +16,7 @@ from peft import LoraConfig from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset, GQADataset + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop ####################################################################### @@ -361,6 +361,16 @@ # tokenizer=tokenizer, # image_processor=image_processor, # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ) ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index bca2e4609..da037ee53 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -5,6 +5,7 @@ from .textvqa_dataset import TextVQADataset from .gqa_dataset import GQADataset from .vqav2_dataset import VQAv2Dataset +from .chartqa_dataset import ChartQADataset 
__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset', - 'VQAv2Dataset'] + 'VQAv2Dataset', 'ChartQADataset'] diff --git a/xtuner/dataset/evaluation/chartqa_dataset.py b/xtuner/dataset/evaluation/chartqa_dataset.py new file mode 100644 index 000000000..834f2a0cc --- /dev/null +++ b/xtuner/dataset/evaluation/chartqa_dataset.py @@ -0,0 +1,173 @@ +import os +import os.path as osp +from typing import Optional +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .gqa_eval_utils import eval_gqa + + +def relaxed_correctness(target: str, + prediction: str, + max_relative_change: float = 0.05) -> bool: + """Calculates relaxed correctness. + + The correctness tolerates certain error ratio defined by max_relative_change. + See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1: + “Following Methani et al. (2020), we use a relaxed accuracy measure for the + numeric answers to allow a minor inaccuracy that may result from the automatic + data extraction process. We consider an answer to be correct if it is within + 5% of the gold answer. For non-numeric answers, we still need an exact match + to consider an answer to be correct.” + + Args: + target: Target string. + prediction: Predicted string. + max_relative_change: Maximum relative change. + + Returns: + Whether the prediction was correct given the specified tolerance. + """ + + def _to_float(text: str) -> Optional[float]: + try: + if text.endswith('%'): + # Convert percentages to floats. + return float(text.rstrip('%')) / 100.0 + else: + return float(text) + except ValueError: + return None + + prediction_float = _to_float(prediction) + target_float = _to_float(target) + if prediction_float is not None and target_float: + relative_change = abs(prediction_float - + target_float) / abs(target_float) + return relative_change <= max_relative_change + else: + return prediction.lower() == target.lower() + + +def evaluate_relaxed_accuracy(entries): + scores = [] + for elem in entries: + if isinstance(elem['label'], str): + elem['label'] = [elem['label']] + score = max([ + relaxed_correctness(elem['prediction'].strip(), ann) + for ann in elem['label'] + ]) + scores.append(score) + return sum(scores) / len(scores) + + +class ChartQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='chartqa') + + def __init__( + self, + data_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + + if isinstance(data_file, str): + data_file = [data_file] + self.raw_data = [json.load(open(f)) for f in data_file] + # test_human, test_augmented + self.name = [ + os.path.splitext(os.path.basename(f))[0] for f in data_file + ] + self.name_map = {name: i for i, name in enumerate(self.name)} + self.revert_name_map = {i: name for i, name in enumerate(self.name)} + + template = prompt_template + self.template = template + + self.image_folder = image_folder + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + 
self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + data_list = [] + idx = 0 + + for data_idx in range(len(self.raw_data)): + for sample_idx in range(len(self.raw_data[data_idx])): + sample = self.raw_data[data_idx][sample_idx] + image_path = sample['imgname'] + question = sample['query'] + answer = sample['label'] + category = self.name[data_idx] + data = { + 'img_id': idx, + 'img': image_path, + 'question': question, + 'answer': answer, + 'category': category + } + data_list.append(data) + idx += 1 + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, result, work_dir): + orig_index = [x['img_id'] for x in self.data] + results = [[] for _ in range(len(self.name))] + for pred_dict in result: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + cur_result = {} + cur_result['query'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['label'] = filtered_rows.get('answer') + + index = self.name_map[filtered_rows['category']] + results[index].append(cur_result) + + print_log('============================================', 'current') + acc_list = [] + for i, result in enumerate(results): + prediction_file = osp.join(work_dir, self.revert_name_map[i] + '.json') + with open(prediction_file, 'w') as f: + json.dump(result, f) + + _accuracy = evaluate_relaxed_accuracy(result) + print_log('Acc: {}, Category: {}, # samples: {}'.format(_accuracy, self.revert_name_map[i], + len(result)), 'current') + acc_list.append(_accuracy) + + print_log('============================================', 'current') + acc = sum(acc_list) / len(acc_list) + print_log('Overall Acc: {}'.format(acc), 'current') + print_log('============================================', 'current') + print_log('ChartQA successfully finished evaluating', 'current') + + return {'Acc': acc} diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index 73e85af44..d65375fa4 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -28,6 +28,9 @@ def getitem(self, idx, data): else: text = text + ("Answer with the option's letter from the " 'given choices directly.') + elif self.eval_ds.metainfo['name'] == 'chartqa': + text = data['question'] + '\nAnswer the question using a single word or phrase.' 
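+            # Illustrative note on scoring: ChartQADataset.evaluate (added above) applies the
+            # relaxed-accuracy rule to the short answers produced with this prompt. For example,
+            # with gold '0.125' and prediction '12%': '12%' is parsed as 0.12 and
+            # |0.12 - 0.125| / 0.125 = 0.04 <= 0.05, so it counts as correct; non-numeric
+            # answers still require an exact, case-insensitive match.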
+ text = DEFAULT_IMAGE_TOKEN + '\n' + text else: text = data['question'] text = DEFAULT_IMAGE_TOKEN + '\n' + text @@ -59,7 +62,7 @@ def getitem(self, idx, data): data_dict['input_ids'] = ids # 3 process image - if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2']: + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: # MMEDataset or TextVQADataset image = Image.open(os.path.join(self.eval_ds.image_folder, data['image_path'])).convert('RGB') From dc153176c646b3651e43b5a1857bd73714691a47 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 23 Apr 2024 17:20:43 +0800 Subject: [PATCH 061/126] fix --- xtuner/dataset/evaluation/chartqa_dataset.py | 11 ++++++----- xtuner/dataset/evaluation/vqav2_dataset.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/xtuner/dataset/evaluation/chartqa_dataset.py b/xtuner/dataset/evaluation/chartqa_dataset.py index 834f2a0cc..5d9939d9f 100644 --- a/xtuner/dataset/evaluation/chartqa_dataset.py +++ b/xtuner/dataset/evaluation/chartqa_dataset.py @@ -8,11 +8,10 @@ from xtuner.registry import BUILDER from mmengine.logging import print_log from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset -from .gqa_eval_utils import eval_gqa -def relaxed_correctness(target: str, - prediction: str, +def relaxed_correctness(prediction: str, + target: str, max_relative_change: float = 0.05) -> bool: """Calculates relaxed correctness. @@ -25,8 +24,8 @@ def relaxed_correctness(target: str, to consider an answer to be correct.” Args: - target: Target string. prediction: Predicted string. + target: Target string. max_relative_change: Maximum relative change. Returns: @@ -83,6 +82,8 @@ def __init__( proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), ): super().__init__(metainfo) + self.use_system=use_system + self.for_llava_prompt = for_llava_prompt if isinstance(data_file, str): data_file = [data_file] @@ -119,7 +120,7 @@ def load_data_list(self): category = self.name[data_idx] data = { 'img_id': idx, - 'img': image_path, + 'image_path': image_path, 'question': question, 'answer': answer, 'category': category diff --git a/xtuner/dataset/evaluation/vqav2_dataset.py b/xtuner/dataset/evaluation/vqav2_dataset.py index e7161e75a..f9d4fb7d0 100644 --- a/xtuner/dataset/evaluation/vqav2_dataset.py +++ b/xtuner/dataset/evaluation/vqav2_dataset.py @@ -33,9 +33,9 @@ def __init__( self.test_file = test_file self.image_folder = image_folder # Save detailed information for easy viewing - self.answer_file = 'answer_vqav2_results.jsonl' + self.answer_file = 'answer_vqav2_results.json' # solely for evaluation purposes - self.prediction_file = 'pred_vqav2_results.jsonl' + self.prediction_file = 'pred_vqav2_results.json' self.answer_processor = EvalAIAnswerProcessor() self.use_system = use_system From d9de3038094962aebf3cd3bafc89e976d6e6147d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 09:41:06 +0800 Subject: [PATCH 062/126] update --- xtuner/dataset/evaluation/__init__.py | 3 +- xtuner/dataset/evaluation/docvqa_dataset.py | 138 ++++++++++++++++++++ 2 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 xtuner/dataset/evaluation/docvqa_dataset.py diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index da037ee53..3b6c71463 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -6,6 +6,7 @@ from .gqa_dataset import GQADataset from .vqav2_dataset import VQAv2Dataset from .chartqa_dataset import 
ChartQADataset +from .docvqa_dataset import DocVQADataset __all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset', - 'VQAv2Dataset', 'ChartQADataset'] + 'VQAv2Dataset', 'ChartQADataset', 'DocVQADataset'] diff --git a/xtuner/dataset/evaluation/docvqa_dataset.py b/xtuner/dataset/evaluation/docvqa_dataset.py new file mode 100644 index 000000000..4da2d4967 --- /dev/null +++ b/xtuner/dataset/evaluation/docvqa_dataset.py @@ -0,0 +1,138 @@ +import os +import os.path as osp +from typing import Optional +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +import pandas as pd +from xtuner.dataset.utils import decode_base64_to_image +import numpy as np + + +def levenshtein_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + + +def anls_compute(groundtruth, prediction): + gt_answer = ' '.join(groundtruth.strip().lower().split()) + det_answer = ' '.join(prediction.strip().lower().split()) + dist = levenshtein_distance(gt_answer, det_answer) + length = max(len(groundtruth.upper()), len(prediction.upper())) + values = 0.0 if length == 0 else float(dist) / float(length) + return values + + +def hit_calculate(result, dataset_name, anls_threshold=0.5): + if dataset_name == 'DocVQA': + # return [1 - np.min(x['match']) >= anls_threshold for x in result] + return [0.0 if 1 - np.min(x['match']) < anls_threshold else 1 - np.min(x['match']) for x in result] + else: + raise NotImplementedError(f"Dataset {dataset_name} not supported for hit calculation") + + +class DocVQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='docvqa') + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + anls_threshold=0.5, use_system=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.anls_threshold = anls_threshold + self.use_system = use_system + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + 
question = self.df.iloc[idx]['question'] + split = self.df.iloc[idx]['split'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'question': question, + 'split': split, + 'answer': answer, + 'index': index, + 'img_id': idx + } + data_list.append(data) + return data_list + + @master_only + def evaluate(self, results, work_dir): + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['split'] = filtered_rows.get('split') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + match = [anls_compute(results['answer'], results['prediction']) for results in new_results] + + splits = set([results['split'] for results in new_results]) + ret = dict() + for sp in splits: + sub = [match[i] for i, x in enumerate(new_results) if x['split'] == sp] + hit = hit_calculate(sub, 'DocVQA') + ret[sp] = np.mean(hit) * 100 From c18a81d908019ec5ebe8476bffa49ff35268552f Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 14:50:41 +0800 Subject: [PATCH 063/126] add phi3 pretrain config --- ...clip_vit_large_p14_336_e1_gpu8_pretrain.py | 199 ++++++++++++++++++ xtuner/dataset/llava_proxy_eval_dataset.py | 2 +- 2 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py new file mode 100644 index 000000000..a12baea99 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + 
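+# Note on the max_length passed to LLaVADataset above (set in PART 1 of this config): the formula
+# reserves room for the image patch tokens of the 336 px CLIP encoder with 14 px patches, i.e.
+# (336 // 14) ** 2 = 576 tokens, so max_length = int(2048 - (336 / 14) ** 2) = 2048 - 576 = 1472
+# tokens left for text.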
+####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index d65375fa4..135b42eca 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -48,7 +48,7 @@ def getitem(self, idx, data): # add bos token bos_token_id = self.eval_ds.tokenizer.bos_token_id cur_encode = [bos_token_id] - cur_encode += self.eval_ds.tokenizer.encode(chunk) + cur_encode += self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) else: cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) chunk_encode.append(cur_encode) From 2d0bad56f82ee7ea3f0506026c408ebef1c32277 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 15:03:30 +0800 Subject: [PATCH 064/126] update --- ...clip_vit_large_p14_336_e1_gpu8_pretrain.py | 2 +- ...vit_large_lora_p14_336_e1_gpu8_finetune.py | 393 ++++++++++++++++++ 2 files changed, 394 insertions(+), 1 deletion(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py index a12baea99..7203658b3 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -19,7 +19,7 @@ # PART 1 Settings # ####################################################################### # Model -llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct' +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' # Data diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py new file mode 100644 index 000000000..b5eeea58d --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -0,0 +1,393 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + 
image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + 
num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 2d43f20ebea512098eb3c6bc21ada2da765ea5aa Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 15:06:35 +0800 Subject: [PATCH 065/126] update --- ...uct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index b5eeea58d..6887274cb 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -11,10 +11,14 @@ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.dataset.samplers import LengthGroupedSampler from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook -from xtuner.engine.runner import TrainLoop from xtuner.model import LLaVAModel from xtuner.utils import PROMPT_TEMPLATE from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler ####################################################################### # PART 1 Settings # From f6a688b6da6324b74b9702ade2cabccc9419fb61 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 15:10:55 +0800 Subject: [PATCH 066/126] fix --- ...mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py | 3 +++ ...struct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py index 7203658b3..6db926b50 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -67,6 +67,9 @@ model = dict( type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, freeze_llm=True, freeze_visual_encoder=True, llm=dict( diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index 6887274cb..316c0855c 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -74,6 +74,9 @@ model = dict( type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + 
image_processor=image_processor, freeze_llm=False, freeze_visual_encoder=True, pretrained_pth=pretrained_pth, From dd0365d0eea52791895d780618467f9ecacb37ee Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 15:12:57 +0800 Subject: [PATCH 067/126] update --- ..._mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py | 1 + ...nstruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py index 6db926b50..0da0fd43a 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -85,6 +85,7 @@ ####################################################################### llava_dataset = dict( type=LLaVADataset, + offline_processed_text_folder='./phi3_mini_llava_pretrain', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index 316c0855c..3e3cd501c 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -96,7 +96,7 @@ ####################################################################### llava_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder='./phi3_mini_llava_finetune', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, From 643c6f66bf089046a374116d44c498162332dcf5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 15:47:41 +0800 Subject: [PATCH 068/126] fix mmmu results --- xtuner/dataset/evaluation/multiple_choice_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index e62e73401..bc035d7c7 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -218,9 +218,10 @@ def show_result(ret_json): data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] l2 = calc_acc(data_main, 'l2-category') ret_json.update(l2) - else: - leaf = calc_acc(data_main, 'category') - ret_json.update(leaf) + + leaf = calc_acc(data_main, 'category') + ret_json.update(leaf) + print_log('============================================', 'current') show_result(ret_json) print_log('============================================', 'current') From 797af9a7ef7081811007c09e0767eb3cdd56c988 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 16:12:27 +0800 Subject: [PATCH 069/126] fix mmmu results --- .../evaluation/multiple_choice_dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index bc035d7c7..cbc75f63b 100644 --- 
a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -16,6 +16,21 @@ from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset +def MMMU_preproc(data): + cnt = 0 + As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer']) + lt = len(data) + for i in range(lt): + if pd.isna(As[i]): + As[i] = Ans[i] + Bs[i] = 'Other Answers' + cnt += 1 + print_log(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ', 'current') + data['A'] = As + data['B'] = Bs + return data + + class MultipleChoiceDataset(BaseEvalDataset): # 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d' METAINFO: dict = dict(name='multiple_choice') @@ -26,6 +41,9 @@ def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_i self.use_system = use_system self.data_file = data_file self.df = pd.read_csv(data_file, sep='\t') + + if 'MMMU' in os.path.basename(data_file): + self.df = MMMU_preproc(self.df) self.split = 'dev' if 'answer' in self.df.iloc[0].keys() else 'test' self.has_l2_category = 'l2-category' in self.df.columns.to_list() From e24b6dcadedca24755946b5c27d672f783e29993 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 16:59:56 +0800 Subject: [PATCH 070/126] fix mmmu results --- .../evaluation/multiple_choice_dataset.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index cbc75f63b..1e9402e93 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -80,6 +80,8 @@ def load_data_list(self): answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ 0].keys() else None category = self.df.iloc[idx]['category'] + split = self.df.iloc[idx]['split'] if 'split' in self.df.iloc[ + 0].keys() else None options = { cand: self.load_from_df(idx, cand) @@ -100,6 +102,7 @@ def load_data_list(self): 'options_dict': options, 'index': index, 'context': hint, + 'split': split, 'img_id': idx } if self.has_l2_category: @@ -121,17 +124,17 @@ def load_from_df(self, idx, key): @master_only def evaluate(self, results, work_dir): - def calc_acc(df, group='category'): + def calc_acc(df, split, group='category'): assert group in ['overall', 'category', 'l2-category'] if group == 'overall': - res = {'Average': np.mean(df['hit'])} + res = {'Average': np.mean(df[df['split'] == split]['hit'])} else: res = {} abilities = list(set(df[group])) abilities.sort() for ab in abilities: sub_df = df[df[group] == ab] - res[ab] = np.mean(sub_df['hit']) + res[ab] = np.mean(sub_df[sub_df['split'] == split]['hit']) return res def eval_sub_data(sub_data, answer_map): @@ -145,9 +148,12 @@ def eval_sub_data(sub_data, answer_map): return 0 return 1 - def show_result(ret_json): + def show_result(ret_json, split): show_dict = ret_json.copy() - table = Table(title=f' Multiple Choice ({self.data_file}) ') + if split != 'none': + table = Table(title=f'{split}: Multiple Choice ({self.data_file}) ') + else: + table = Table(title=f' Multiple Choice ({self.data_file}) ') console = Console() table.add_column('Category', justify='left') table.add_column('Accuracy (%)', justify='right') @@ -230,18 +236,24 @@ def show_result(ret_json): main_idx = data_main['index'] data_main['category'] = [cate_map[i] for i in main_idx] - ret_json = calc_acc(data_main, 'overall') + if 'split' in data_main: + splits 
= list(set(data_main['split'])) + else: + splits = ['none'] - if self.has_l2_category: - data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] - l2 = calc_acc(data_main, 'l2-category') - ret_json.update(l2) + for split in splits: + ret_json = calc_acc(data_main, split, 'overall') - leaf = calc_acc(data_main, 'category') - ret_json.update(leaf) + if self.has_l2_category: + data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] + l2 = calc_acc(data_main, split, 'l2-category') + ret_json.update(l2) - print_log('============================================', 'current') - show_result(ret_json) + leaf = calc_acc(data_main, split, 'category') + ret_json.update(leaf) + + print_log('============================================', 'current') + show_result(ret_json,split) print_log('============================================', 'current') print_log('Multiple Choice successfully finished evaluating' 'current') return ret_json From 1ced889f319ba46e75351793310fe49cb507953f Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 17:03:01 +0800 Subject: [PATCH 071/126] fix mmmu results --- xtuner/dataset/evaluation/multiple_choice_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index 1e9402e93..07617675f 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -150,12 +150,12 @@ def eval_sub_data(sub_data, answer_map): def show_result(ret_json, split): show_dict = ret_json.copy() + table = Table(title=f' Multiple Choice ({self.data_file}) ') + console = Console() if split != 'none': - table = Table(title=f'{split}: Multiple Choice ({self.data_file}) ') + table.add_column(f'Category ({split} )', justify='left') else: - table = Table(title=f' Multiple Choice ({self.data_file}) ') - console = Console() - table.add_column('Category', justify='left') + table.add_column('Category', justify='left') table.add_column('Accuracy (%)', justify='right') average = show_dict.pop('Average') * 100 table.add_row('Average', f'{average:.1f}') From 35945822f321f0025efcccacece7b987e45274f0 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 17:08:44 +0800 Subject: [PATCH 072/126] update --- xtuner/dataset/evaluation/multiple_choice_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index 07617675f..de0903bfb 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -153,7 +153,7 @@ def show_result(ret_json, split): table = Table(title=f' Multiple Choice ({self.data_file}) ') console = Console() if split != 'none': - table.add_column(f'Category ({split} )', justify='left') + table.add_column(f'Category ({split})', justify='left') else: table.add_column('Category', justify='left') table.add_column('Accuracy (%)', justify='right') From 2f4055c35dbdc958fc91986a807904fdb940eb42 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 25 Apr 2024 20:39:29 +0800 Subject: [PATCH 073/126] update infovqa --- ...vit_large_lora_p14_336_e1_gpu8_finetune.py | 28 ++++++++++- xtuner/dataset/evaluation/__init__.py | 4 +- ...cvqa_dataset.py => general_vqa_dataset.py} | 50 +++++++++++++++---- xtuner/dataset/llava_proxy_eval_dataset.py | 4 +- 4 files changed, 71 insertions(+), 15 deletions(-) rename 
xtuner/dataset/evaluation/{docvqa_dataset.py => general_vqa_dataset.py} (75%) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index 3e3cd501c..dcea773e4 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -15,7 +15,7 @@ from xtuner.utils import PROMPT_TEMPLATE from peft import LoraConfig from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ - HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler @@ -371,7 +371,31 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True - ) + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py index 3b6c71463..652ae88e4 100644 --- a/xtuner/dataset/evaluation/__init__.py +++ b/xtuner/dataset/evaluation/__init__.py @@ -6,7 +6,7 @@ from .gqa_dataset import GQADataset from .vqav2_dataset import VQAv2Dataset from .chartqa_dataset import ChartQADataset -from .docvqa_dataset import DocVQADataset +from .general_vqa_dataset import GeneralVQADataset __all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset', - 'VQAv2Dataset', 'ChartQADataset', 'DocVQADataset'] + 'VQAv2Dataset', 'ChartQADataset', 'GeneralVQADataset'] diff --git a/xtuner/dataset/evaluation/docvqa_dataset.py b/xtuner/dataset/evaluation/general_vqa_dataset.py similarity index 75% rename from xtuner/dataset/evaluation/docvqa_dataset.py rename to xtuner/dataset/evaluation/general_vqa_dataset.py index 4da2d4967..dc73cd017 100644 --- a/xtuner/dataset/evaluation/docvqa_dataset.py +++ b/xtuner/dataset/evaluation/general_vqa_dataset.py @@ -1,7 +1,5 @@ import os import os.path as osp -from typing import Optional -import json from mmengine.dist import master_only from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset @@ -39,15 +37,26 @@ def anls_compute(groundtruth, prediction): def hit_calculate(result, dataset_name, anls_threshold=0.5): - if dataset_name == 'DocVQA': + if 'DocVQA' in dataset_name or 'InfoVQA' in dataset_name: # return [1 - np.min(x['match']) >= anls_threshold for x in result] return [0.0 if 1 - np.min(x['match']) < 
anls_threshold else 1 - np.min(x['match']) for x in result] + elif 'OCRVQA' in dataset_name: + return [np.max(x['match']) for x in result] else: raise NotImplementedError(f"Dataset {dataset_name} not supported for hit calculation") -class DocVQADataset(BaseEvalDataset): - METAINFO: dict = dict(name='docvqa') +def istype(s, type): + if isinstance(s, type): + return True + try: + return isinstance(eval(s), type) + except Exception as _: + return False + + +class GeneralVQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='gvqa') def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, anls_threshold=0.5, use_system=False, metainfo=None, @@ -86,6 +95,11 @@ def get_image(self, image): def __len__(self): return len(self.df) + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + def load_data_list(self): data_list = [] for idx in range(len(self.df)): @@ -121,18 +135,34 @@ def evaluate(self, results, work_dir): cur_result['split'] = filtered_rows.get('split') cur_result['prediction'] = pred_dict['prediction'] cur_result['index'] = filtered_rows.get('index') - cur_result['answer'] = filtered_rows.get('answer') + cur_result['index'] = filtered_rows.get('answer') + answers = filtered_rows.get('answer') + if istype(answers, list): + answers = eval(answers) + else: + answers = [answers] + if 'OCRVQA' in self.name: + match = [(1.0 if (x.strip().lower() == cur_result['prediction'].strip().lower()) else 0.0) for x in + answers] + else: + match = [anls_compute(x, cur_result['prediction']) for x in answers] + cur_result['match'] = match + new_results.append(cur_result) results_df = pd.DataFrame(new_results) with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: results_df.to_excel(writer, index=False) - match = [anls_compute(results['answer'], results['prediction']) for results in new_results] - splits = set([results['split'] for results in new_results]) ret = dict() for sp in splits: - sub = [match[i] for i, x in enumerate(new_results) if x['split'] == sp] - hit = hit_calculate(sub, 'DocVQA') + sub = [new_results[i] for i, x in enumerate(new_results) if x['split'] == sp] + hit = hit_calculate(sub, self.name) ret[sp] = np.mean(hit) * 100 + + print_log('============================================', 'current') + print_log(ret, 'current') + print_log('============================================', 'current') + print_log(f'DocVQA successfully finished evaluating', 'current') + return ret diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index 135b42eca..d13d273df 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -26,9 +26,11 @@ def getitem(self, idx, data): if is_cn_string(text): text = text + '请直接回答选项字母。' else: + # TODO prompt are different of vlmevalkit text = text + ("Answer with the option's letter from the " 'given choices directly.') - elif self.eval_ds.metainfo['name'] == 'chartqa': + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit text = data['question'] + '\nAnswer the question using a single word or phrase.' 
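
For reference, the ANLS metric that the renamed GeneralVQADataset applies to DocVQA/InfoVQA (OCRVQA falls back to exact matching) is a thresholded, normalized edit distance. A minimal self-contained sketch — the helper names below are invented and this is only illustrative; the patch keeps its own anls_compute/hit_calculate pair:

    def edit_distance(a: str, b: str) -> int:
        # plain dynamic-programming Levenshtein distance
        dp = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            prev, dp[0] = dp[0], i
            for j, cb in enumerate(b, 1):
                prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1,
                                         prev + (ca != cb))
        return dp[-1]

    def anls_score(gt_answers, prediction, threshold=0.5):
        # normalized distance to each reference answer; keep the closest one
        dists = [edit_distance(gt.strip().lower(), prediction.strip().lower())
                 / max(len(gt), len(prediction), 1) for gt in gt_answers]
        similarity = 1 - min(dists)
        # predictions too far from every reference contribute 0 to the average
        return similarity if similarity >= threshold else 0.0
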
text = DEFAULT_IMAGE_TOKEN + '\n' + text else: From 666beededb997c866335f407b960f6343aa75828 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 09:35:59 +0800 Subject: [PATCH 074/126] fix --- xtuner/dataset/evaluation/general_vqa_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xtuner/dataset/evaluation/general_vqa_dataset.py b/xtuner/dataset/evaluation/general_vqa_dataset.py index dc73cd017..64c4661e1 100644 --- a/xtuner/dataset/evaluation/general_vqa_dataset.py +++ b/xtuner/dataset/evaluation/general_vqa_dataset.py @@ -106,7 +106,8 @@ def load_data_list(self): index = self.df.iloc[idx]['index'] image = self.df.iloc[idx]['image'] question = self.df.iloc[idx]['question'] - split = self.df.iloc[idx]['split'] + split = self.df.iloc[idx]['split'] if 'split' in self.df.iloc[ + 0].keys() else None answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ 0].keys() else None From 13b499c09c1f56fa8916192d04e04fed1f43af01 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 09:46:11 +0800 Subject: [PATCH 075/126] fix --- .../dataset/evaluation/general_vqa_dataset.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/xtuner/dataset/evaluation/general_vqa_dataset.py b/xtuner/dataset/evaluation/general_vqa_dataset.py index 64c4661e1..d827cc6f5 100644 --- a/xtuner/dataset/evaluation/general_vqa_dataset.py +++ b/xtuner/dataset/evaluation/general_vqa_dataset.py @@ -114,11 +114,13 @@ def load_data_list(self): data = { 'img': image, 'question': question, - 'split': split, 'answer': answer, 'index': index, 'img_id': idx } + if split is not None: + data['split'] = split + data_list.append(data) return data_list @@ -155,12 +157,16 @@ def evaluate(self, results, work_dir): with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: results_df.to_excel(writer, index=False) - splits = set([results['split'] for results in new_results]) ret = dict() - for sp in splits: - sub = [new_results[i] for i, x in enumerate(new_results) if x['split'] == sp] - hit = hit_calculate(sub, self.name) - ret[sp] = np.mean(hit) * 100 + if 'split' in results_df: + splits = list(set(results_df['split'])) + for sp in splits: + sub = [new_results[i] for i, x in enumerate(new_results) if x['split'] == sp] + hit = hit_calculate(sub, self.name) + ret[sp] = np.mean(hit) * 100 + else: + hit = hit_calculate(new_results, self.name) + ret['overall'] = np.mean(hit) * 100 print_log('============================================', 'current') print_log(ret, 'current') From 2cbb29b1b22150297e219c6950c849c82e86e779 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 10:33:43 +0800 Subject: [PATCH 076/126] add any res --- ...ge_lora_p14_336_anyres_e1_gpu8_finetune.py | 455 ++++++++++++++++++ .../anyres_llava_proxy_eval_dataset.py | 7 +- xtuner/dataset/llava.py | 1 + xtuner/model/anyres_llava.py | 6 +- 4 files changed, 465 insertions(+), 4 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py new file mode 100644 index 000000000..60dc81ada --- /dev/null +++ 
b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -0,0 +1,455 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import AnyResLLaVADataset, AnyResLLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=-1, + tokenizer=tokenizer, + template=prompt_template, + 
image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +pad_image_to_square = False + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='./phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +proxy_eval_dataset = dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + 
image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=pad_image_to_square), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # 
test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py index 3a76f3c2e..e9ae16728 100644 --- a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py +++ b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py @@ -35,8 +35,13 @@ def getitem(self, idx, data): if is_cn_string(text): text = text + '请直接回答选项字母。' else: + # TODO prompt are different of vlmevalkit text = text + ("Answer with the option's letter from the " 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' 
+ text = DEFAULT_IMAGE_TOKEN + '\n' + text else: text = data['question'] text = DEFAULT_IMAGE_TOKEN + '\n' + text @@ -65,7 +70,7 @@ def getitem(self, idx, data): data_dict['input_ids'] = ids # 3 process image - if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: # MMEDataset or TextVQADataset image = Image.open(os.path.join(self.eval_ds.image_folder, data['image_path'])).convert('RGB') diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index e6b1c8ec0..cfd2eabcf 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -160,6 +160,7 @@ def __getitem__(self, index): self.image_grid_pinpoints, self._patch_size, self._shortest_edge, pad_mean=tuple(int(x * 255) for x in self.image_processor.image_mean), + # keep the same as the original implementation orig_img_pad_to_square=self.pad_image_to_square) data_dict['pixel_values'] = image else: diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 9d3ed2a0d..e91821d4a 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -116,7 +116,7 @@ def __init__(self, llm, torch.randn( self.llm.config.hidden_size, dtype=self.visual_encoder.dtype)) self.image_grid_pinpoints = image_grid_pinpoints - # self.mm_patch_merge_type = 'spatial_unpad' + self.mm_patch_merge_type = 'spatial_unpad' self.image_aspect_ratio = 'anyres' def state_dict(self, *args, **kwargs): @@ -183,7 +183,7 @@ def __preprocess_for_pixel_values(self, data): bs, pn, hs = visual_outputs.shape # token merge - if self.token_merge_ratio != 1: + if self.token_merge_ratio != -1: # 27 不是偶数,不能被整除,需要 hard code 处理下 if pn == 27 * 27: if self.merge_type == 'simple': @@ -218,7 +218,7 @@ def __preprocess_for_pixel_values(self, data): image_features = torch.split(image_features, split_sizes, dim=0) new_image_feature = [] - if self.token_merge_ratio == 1: + if self.token_merge_ratio == -1: for image_idx, image_feature in enumerate(image_features): if image_feature.shape[0] > 1: base_image_feature = image_feature[0] From 42d08d731b5c4fcaf1deabc5c48233d536bdb18a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 11:00:56 +0800 Subject: [PATCH 077/126] fix --- ...ge_lora_p14_336_anyres_e1_gpu8_finetune.py | 19 ++++++++++--------- ...vit_large_lora_p14_336_e1_gpu8_finetune.py | 16 ++++++++-------- .../evaluation/multiple_choice_dataset.py | 2 +- xtuner/dataset/llava_proxy_eval_dataset.py | 4 ++++ 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py index 60dc81ada..cdd06a56d 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -418,15 +418,16 @@ image_processor=image_processor, pad_image_to_square=pad_image_to_square ), - dict( - type=GeneralVQADataset, - proxy_eval_dataset=proxy_eval_dataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', - prompt_template=prompt_template, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=pad_image_to_square - ), + # 有问题,需要图片 + # dict( + # type=GeneralVQADataset, + # 
proxy_eval_dataset=proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square + # ), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index dcea773e4..bc1596b4f 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -388,14 +388,14 @@ image_processor=image_processor, pad_image_to_square=True ), - dict( - type=GeneralVQADataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', - prompt_template=prompt_template, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True - ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), ] # TODO: We are not currently using val_evaluator diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index de0903bfb..a272e7f74 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -239,7 +239,7 @@ def show_result(ret_json, split): if 'split' in data_main: splits = list(set(data_main['split'])) else: - splits = ['none'] + splits = [None] for split in splits: ret_json = calc_acc(data_main, split, 'overall') diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py index d13d273df..e04f8391a 100644 --- a/xtuner/dataset/llava_proxy_eval_dataset.py +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -33,6 +33,10 @@ def getitem(self, idx, data): # TODO prompt are different of vlmevalkit text = data['question'] + '\nAnswer the question using a single word or phrase.' text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' 
+ text = DEFAULT_IMAGE_TOKEN + '\n' + text else: text = data['question'] text = DEFAULT_IMAGE_TOKEN + '\n' + text From 0340daa62a32ae20b40330b447439bc995dcbd6a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 11:03:56 +0800 Subject: [PATCH 078/126] fix --- ...vit_large_lora_p14_336_e1_gpu8_finetune.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py index bc1596b4f..ae713b691 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -339,20 +339,20 @@ tokenizer=tokenizer, image_processor=image_processor, pad_image_to_square=True), - dict( - type=MultipleChoiceDataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', - prompt_template=prompt_template, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), - dict( - type=MultipleChoiceDataset, - data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', - prompt_template=prompt_template, - tokenizer=tokenizer, - image_processor=image_processor, - pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), # dict( # type=VQAv2Dataset, # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', From cb1f29c8a16025f859c5613e000723c7db717b11 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 11:27:40 +0800 Subject: [PATCH 079/126] fix --- xtuner/dataset/evaluation/chartqa_dataset.py | 7 +++++-- .../evaluation/multiple_choice_dataset.py | 16 ++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/xtuner/dataset/evaluation/chartqa_dataset.py b/xtuner/dataset/evaluation/chartqa_dataset.py index 5d9939d9f..c8b47dc7f 100644 --- a/xtuner/dataset/evaluation/chartqa_dataset.py +++ b/xtuner/dataset/evaluation/chartqa_dataset.py @@ -62,7 +62,7 @@ def evaluate_relaxed_accuracy(entries): for ann in elem['label'] ]) scores.append(score) - return sum(scores) / len(scores) + return scores, sum(scores) / len(scores) class ChartQADataset(BaseEvalDataset): @@ -156,11 +156,14 @@ def evaluate(self, result, work_dir): print_log('============================================', 'current') acc_list = [] for i, result in enumerate(results): + scores, _accuracy = evaluate_relaxed_accuracy(result) + + for res, score in zip(result, scores): + res['score'] = score prediction_file = osp.join(work_dir, self.revert_name_map[i] + '.json') with open(prediction_file, 'w') as f: json.dump(result, f) - _accuracy = evaluate_relaxed_accuracy(result) print_log('Acc: {}, Category: {}, # samples: {}'.format(_accuracy, self.revert_name_map[i], len(result)), 'current') 
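
The ChartQA change above threads the per-sample relaxed-accuracy scores back into the dumped JSON. The underlying criterion (exact string match, or at most 5% relative error for numeric answers) is roughly the following — a sketch only, not the repo's exact helper:

    def relaxed_correctness(target: str, prediction: str,
                            max_relative_change: float = 0.05) -> bool:
        def to_float(text: str):
            try:
                # percentages such as '42%' are compared on the 0-1 scale
                return float(text.rstrip('%')) / (100.0 if text.endswith('%') else 1.0)
            except ValueError:
                return None

        t, p = to_float(target.strip()), to_float(prediction.strip())
        if t is not None and p is not None and t != 0:
            return abs(p - t) / abs(t) <= max_relative_change
        return prediction.strip().lower() == target.strip().lower()

Each question is typically counted as correct if any of its annotated labels passes this check, which is what the per-label comprehension over elem['label'] above feeds into.
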
acc_list.append(_accuracy) diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py b/xtuner/dataset/evaluation/multiple_choice_dataset.py index a272e7f74..e9e139e34 100644 --- a/xtuner/dataset/evaluation/multiple_choice_dataset.py +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -102,9 +102,11 @@ def load_data_list(self): 'options_dict': options, 'index': index, 'context': hint, - 'split': split, 'img_id': idx } + if split is not None: + data['split'] = split + if self.has_l2_category: data.update({'l2-category': self.df.iloc[idx]['l2-category']}) data_list.append(data) @@ -127,14 +129,20 @@ def evaluate(self, results, work_dir): def calc_acc(df, split, group='category'): assert group in ['overall', 'category', 'l2-category'] if group == 'overall': - res = {'Average': np.mean(df[df['split'] == split]['hit'])} + if split is None: + res = {'Average': np.mean(df['hit'])} + else: + res = {'Average': np.mean(df[df['split'] == split]['hit'])} else: res = {} abilities = list(set(df[group])) abilities.sort() for ab in abilities: sub_df = df[df[group] == ab] - res[ab] = np.mean(sub_df[sub_df['split'] == split]['hit']) + if split is None: + res[ab] = np.mean(sub_df['hit']) + else: + res[ab] = np.mean(sub_df[sub_df['split'] == split]['hit']) return res def eval_sub_data(sub_data, answer_map): @@ -152,7 +160,7 @@ def show_result(ret_json, split): show_dict = ret_json.copy() table = Table(title=f' Multiple Choice ({self.data_file}) ') console = Console() - if split != 'none': + if split is not None: table.add_column(f'Category ({split})', justify='left') else: table.add_column('Category', justify='left') From a19d59713a9995462c6bebe09773db4b8d7d93a6 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 13:47:57 +0800 Subject: [PATCH 080/126] fix --- xtuner/dataset/evaluation/general_vqa_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/dataset/evaluation/general_vqa_dataset.py b/xtuner/dataset/evaluation/general_vqa_dataset.py index d827cc6f5..c87a4d7ac 100644 --- a/xtuner/dataset/evaluation/general_vqa_dataset.py +++ b/xtuner/dataset/evaluation/general_vqa_dataset.py @@ -171,5 +171,5 @@ def evaluate(self, results, work_dir): print_log('============================================', 'current') print_log(ret, 'current') print_log('============================================', 'current') - print_log(f'DocVQA successfully finished evaluating', 'current') + print_log(f'{self.name} successfully finished evaluating', 'current') return ret From 7db1352e3838e438e8ee6e8116e64e0937f751e3 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 13:57:35 +0800 Subject: [PATCH 081/126] update file --- ..._full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py index cdd06a56d..d989c9e82 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -34,7 +34,7 @@ data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' image_folder = data_root + 'llava_images' prompt_template = PROMPT_TEMPLATE.phi3_chat 
-max_length = int(2048 - (336 / 14) ** 2) +max_length = int(4096 - (336 / 14) ** 2) # Scheduler & Optimizer batch_size = 16 # per_device From 652237500e0b885a586fb84f3217d3c11974e304 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 14:00:29 +0800 Subject: [PATCH 082/126] update file --- ...ull_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py index d989c9e82..50a62e030 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -58,8 +58,8 @@ evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] -image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], - [384, 1152]] +image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1152, 336], + [336, 1008]] ####################################################################### # PART 2 Model & Tokenizer & Image Processor # From 256c0f509cbe81830afc63b776d9809590109931 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 14:09:40 +0800 Subject: [PATCH 083/126] fix --- xtuner/model/anyres_llava.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index e91821d4a..f92b9bc55 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -174,9 +174,10 @@ def __preprocess_for_pixel_values(self, data): # b*n, 27*27, d visual_outputs = self.visual_encoder( concat_images.to(self.visual_encoder.dtype), output_hidden_states=True) - if type(self.visual_encoder).__name__ == 'CLIPVisionModel': + + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] - elif type(self.visual_encoder).__name__ == 'SiglipVisionModel': + elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] else: raise NotImplementedError From 1f56691baa05bc919b21138ecc56436bc732b686 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 14:12:43 +0800 Subject: [PATCH 084/126] fix --- xtuner/model/anyres_llava.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index f92b9bc55..2d25a1dda 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -176,6 +176,7 @@ def __preprocess_for_pixel_values(self, data): concat_images.to(self.visual_encoder.dtype), output_hidden_states=True) if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] From d4ef3101f9735e01a80774198a6880bfc97e6cc5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 15:31:45 +0800 Subject: [PATCH 085/126] fix --- xtuner/model/anyres_llava.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py index 2d25a1dda..f92b9bc55 100644 --- a/xtuner/model/anyres_llava.py +++ b/xtuner/model/anyres_llava.py @@ -176,7 +176,6 @@ def __preprocess_for_pixel_values(self, data): concat_images.to(self.visual_encoder.dtype), output_hidden_states=True) if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': - visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel': visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] From 7f62009067774db88ee030db023d3b68952c0e00 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 15:37:03 +0800 Subject: [PATCH 086/126] add config --- ..._chat_clip_e1_gpu16_sharegpt4v_pretrain.py | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py diff --git a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py new file mode 100644 index 000000000..bc577ffd8 --- /dev/null +++ b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/basemodel/checkpoints/llm/hf_hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/e8cf5276ae3e97cfde8a058e64a636f2cde47820' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +data_path = data_root + 'share-captioner_coco_lcs_sam_1246k_1107.json' +image_folder = data_root + 'data' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(4096 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 5e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + 
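
As in the earlier configs, max_length here reserves room in the context window for the visual tokens that later replace the <image> placeholder: a 336-px CLIP ViT-L/14 encoder yields a 24 x 24 patch grid, so

    patch_grid = 336 // 14             # 24 patches per side
    visual_tokens = patch_grid ** 2    # 576 image tokens per sample
    max_length = 4096 - visual_tokens  # 3520 tokens left for text

i.e. int(4096 - (336 // 14) ** 2) = 3520 text tokens per sample.
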
+####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. 
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) From 1bd9be13e3763a52f78ed387a35afa1f520c4dbe Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 26 Apr 2024 18:07:45 +0800 Subject: [PATCH 087/126] update --- ...a_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py index bc577ffd8..3093fd1e1 100644 --- a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py +++ b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py @@ -30,7 +30,7 @@ max_length = int(4096 - (336 // 14) ** 2) # Scheduler & Optimizer -batch_size = 16 # per_device +batch_size = 8 # per_device 32GPUx8bs accumulative_counts = 1 dataloader_num_workers = 4 max_epochs = 1 @@ -99,7 +99,7 @@ train_dataloader = dict( batch_size=batch_size, num_workers=dataloader_num_workers, - pin_memory=True, + # pin_memory=True, dataset=llava_dataset, sampler=dict(type=DefaultSampler, shuffle=True), collate_fn=dict(type=mm_collate_fn)) @@ -160,13 +160,13 @@ # record the time of every iteration. timer=dict(type=IterTimerHook), # print log every 10 iterations. - logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=5), # enable the parameter scheduler. param_scheduler=dict(type=ParamSchedulerHook), # save checkpoint per `save_steps`. 
checkpoint=dict( type=CheckpointHook, - save_optimizer=False, # can save disk memory mmengine >=0.10.3 + save_optimizer=True, # can save disk memory mmengine >=0.10.3 by_epoch=False, interval=save_steps, max_keep_ckpts=save_total_limit), From f6abf85dd91aafc62b770bd5504a39d5a76ab0f1 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 28 Apr 2024 10:26:53 +0800 Subject: [PATCH 088/126] add 70b finetune --- ...0b_chat_clip_e1_gpu16_internvl_finetune.py | 528 ++++++++++++++++++ ...ge_lora_p14_336_anyres_e1_gpu8_finetune.py | 2 +- 2 files changed, 529 insertions(+), 1 deletion(-) create mode 100644 xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py diff --git a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py new file mode 100644 index 000000000..7e88e7f66 --- /dev/null +++ b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py @@ -0,0 +1,528 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/basemodel/checkpoints/llm/hf_hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/e8cf5276ae3e97cfde8a058e64a636f2cde47820' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain/iter_4871.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + 
+docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 4 # per_device 32gpu x 4bs +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 4000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 4000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + # attn_implementation='sdpa', + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### + +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/' + +sharegpt4v_caption_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'llava_dataset', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, 
template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +geoqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'geoqa_dataset', + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: 
https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=5), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +# val_dataset = [ +# dict( +# type=GQADataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', +# ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', +# image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# ] +# +# test_dataset = [ +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', +# 
prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=TextVQADataset, +# data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', +# ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', +# image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MMEDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', +# image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# # for_llava_prompt=True, # 开了后,perception 会掉 +# pad_image_to_square=True), +# dict( +# type=HallusionDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=POPEDataset, +# data_file=[ +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' +# ], +# coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=GQADataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', +# ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', +# image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# 
pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# # dict( +# # type=VQAv2Dataset, +# # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', +# # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', +# # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', +# # prompt_template=PROMPT_TEMPLATE.vicuna, +# # tokenizer=tokenizer, +# # image_processor=image_processor, +# # pad_image_to_square=True), +# ] +# +# # TODO: We are not currently using val_evaluator +# # Don't support num_workers > 0 +# val_dataloader = dict( +# batch_size=1, +# num_workers=0, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict(type=ConcatDataset, datasets=val_dataset), +# collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +# val_evaluator = dict() +# val_cfg = dict(type=ValLoop) +# +# # TODO: We are not currently using test_evaluator +# test_dataloader = dict( +# batch_size=1, +# num_workers=0, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict(type=ConcatDataset, datasets=test_dataset), +# collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +# ) +# +# test_evaluator = val_evaluator +# test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py index 50a62e030..ff16eb0ca 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -58,7 +58,7 @@ evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] -image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1152, 336], +image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] ####################################################################### From d732b58d61793972d454c94c5a1c71471af324f1 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 28 Apr 2024 15:23:31 +0800 Subject: [PATCH 089/126] add internvl 1.5 pretrain --- ...14_336_anyres_e1_gpu8_internvl_finetune.py | 605 ++++++++++++++++++ .../phi3_internvl_1-5_pretrain.py | 212 ++++++ xtuner/dataset/__init__.py | 5 +- .../anyres_llava_proxy_eval_dataset.py | 10 + xtuner/dataset/llava.py | 57 +- xtuner/dataset/utils.py | 76 +++ xtuner/engine/hooks/dataset_info_hook.py | 4 - xtuner/model/__init__.py | 3 +- xtuner/model/internvl_1_5_llava.py | 151 +++++ 9 files changed, 1115 insertions(+), 8 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py create mode 100644 
xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py create mode 100644 xtuner/model/internvl_1_5_llava.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py new file mode 100644 index 000000000..6fa45b613 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py @@ -0,0 +1,605 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import AnyResLLaVADataset, AnyResLLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = 
PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], + [336, 1008]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=-1, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +pad_image_to_square = False + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' + +sharegpt4v_caption_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'llava_dataset', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), 
+ max_length=max_length, + pad_image_to_square=pad_image_to_square) + + +dvqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +geoqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'geoqa_dataset', + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size'])) + + +####################################################################### +# PART 4 Scheduler & Optimizer # 
+####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +proxy_eval_dataset = dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=pad_image_to_square), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # 
prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + # 有问题,需要图片 + # dict( + # type=GeneralVQADataset, + # proxy_eval_dataset=proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py new file mode 100644 index 000000000..653e48024 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
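As an aside on the one-line image_grid_pinpoints change in the previous patch (1152 replaced by 1008), carried into the any-resolution config above: with the 336-pixel CLIP-ViT-L/14-336 backbone, 1008 = 3 x 336 while 1152 is not an exact multiple, so the fix presumably keeps every pinpoint equal to a whole number of 336-pixel tiles. A minimal sketch of that tile bookkeeping, assuming the 336-pixel base size used throughout these configs:

```python
# Tile bookkeeping for the any-resolution grid pinpoints, assuming a
# 336-pixel vision backbone as in the configs above.
BASE = 336
TOKENS_PER_TILE = (BASE // 14) ** 2  # 24 x 24 = 576 ViT patch tokens per tile

image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336],
                        [336, 1008]]

for a, b in image_grid_pinpoints:
    assert a % BASE == 0 and b % BASE == 0, (a, b)  # 1152 would fail this check
    tiles = (a // BASE) * (b // BASE)
    print(f'{a}x{b}: {tiles} tiles, {tiles * TOKENS_PER_TILE} patch tokens')
```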
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(4094 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='./phi3_mini_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + 
tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index b3c3c63fe..4dddf7bd1 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -7,7 +7,7 @@ load_intern_repo_tokenized_dataset, load_intern_repo_untokenized_dataset) -from .llava import LLaVADataset, AnyResLLaVADataset +from .llava import LLaVADataset, AnyResLLaVADataset, InternVL_V1_5_LLaVADataset from .json_dataset import load_json_file from .modelscope import process_ms_dataset from .moss_sft import MOSSSFTDataset @@ -37,5 +37,6 @@ 'LLaVAProxyEvalDataset', 'AnyResLLaVAProxyEvalDataset', 'MiniGeminiDataset', - 'MiniGeminiProxyEvalDataset' + 'MiniGeminiProxyEvalDataset', + 'InternVL_V1_5_LLaVADataset' ] diff --git a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py index e9ae16728..9c3f9351c 100644 --- a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py +++ b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py @@ -42,6 +42,10 @@ def getitem(self, idx, data): # TODO prompt are different of vlmevalkit text = data['question'] + '\nAnswer the question using a single word or phrase.' text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' 
+            text = DEFAULT_IMAGE_TOKEN + '\n' + text
         else:
             text = data['question']
             text = DEFAULT_IMAGE_TOKEN + '\n' + text
@@ -52,6 +56,12 @@ def getitem(self, idx, data):
             inputs = ''
         inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1)
 
+        if self.eval_ds.use_system:
+            inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='')
+        else:
+            inputs = ''
+        inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1)
+
         # 2 tokenize inputs
         chunk_encode = []
         for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)):
diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py
index cfd2eabcf..d44e336fb 100644
--- a/xtuner/dataset/llava.py
+++ b/xtuner/dataset/llava.py
@@ -13,7 +13,7 @@
 from xtuner.registry import BUILDER
 from .huggingface import process_hf_dataset
-from .utils import expand2square, process_anyres_image
+from .utils import expand2square, process_anyres_image, total_image_token, dynamic_preprocess
 
 
 def load_jsonl(json_file):
@@ -168,3 +168,58 @@ def __getitem__(self, index):
             data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'],
                                                     self._crop_size['width'])
         return data_dict
+
+
+class InternVL_V1_5_LLaVADataset(LLaVADataset):
+    def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, *args, **kwargs):
+        self.min_num = min_num
+        self.max_num = max_num
+        self.downsample_ratio = downsample_ratio
+        super().__init__(*args, **kwargs)
+
+        if hasattr(self.image_processor, 'crop_size'):
+            self._crop_size = self.image_processor.crop_size
+        else:
+            self._crop_size = self.image_processor.size
+        self._patch_size = self._crop_size['height']
+        self._shortest_edge = self._crop_size['height']
+
+        # clip
+        self._image_size = image_size
+        self._patch_size = (self._image_size // 14) * downsample_ratio  # 12
+
+    @property
+    def modality_length(self):
+        print_log('start calculating modality length', logger='current'),
+        length_list = []
+        for data_dict in self.text_data:
+            cur_len = len(data_dict['input_ids'])
+            if data_dict.get('image', None) is None:
+                cur_len = -cur_len  # text-only samples get a negative length
+            else:
+                image_file = data_dict['image']
+                image = Image.open(os.path.join(self.image_folder, image_file))
+                num_image_token = total_image_token(image.size, self.min_num, self.max_num, self._image_size,
+                                                    self._patch_size)
+                cur_len += num_image_token
+            length_list.append(cur_len)
+        print_log('end calculating modality length', logger='current'),
+        return length_list
+
+    def __getitem__(self, index):
+        data_dict = self.text_data[index]
+        if data_dict.get('image', None) is not None:
+            image_file = data_dict['image']
+            image = Image.open(os.path.join(self.image_folder,
+                                            image_file)).convert('RGB')
+            images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size)
+            for i, image in enumerate(images):
+                image = self.image_processor.preprocess(
+                    image, return_tensors='pt')['pixel_values'][0]
+                images[i] = image
+            images = torch.stack(images, dim=0)
+            data_dict['pixel_values'] = images
+        else:
+            data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'],
+                                                    self._crop_size['width'])
+        return data_dict
diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py
index 60dbce54e..c626a3aab 100644
--- a/xtuner/dataset/utils.py
+++ b/xtuner/dataset/utils.py
@@ -447,3 +447,79 @@ def unpad_image(tensor, original_size):
     return unpadded_tensor
 
 # ----------------------------------------------------------------------
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+ area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def total_image_token(orig_size, min_num=1, max_num=6, image_size=336, patch_size=24, use_thumbnail=True): + orig_width, orig_height = orig_size + + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + max_num >= i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + if use_thumbnail: + blocks += 1 + + return blocks*patch_size*patch_size + + +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=336, use_thumbnail=True): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + max_num >= i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images diff --git a/xtuner/engine/hooks/dataset_info_hook.py b/xtuner/engine/hooks/dataset_info_hook.py index 76b49e6a9..d997373ec 100644 --- a/xtuner/engine/hooks/dataset_info_hook.py +++ b/xtuner/engine/hooks/dataset_info_hook.py @@ -42,16 +42,12 @@ def log(self, runner, dataset, mode='train'): def before_train(self, runner) -> None: do_train = runner.train_loop is not None do_eval = runner.val_loop is not None - do_test = runner.test_loop is not None if do_train: train_dataset = runner.train_dataloader.dataset self.log(runner, train_dataset, mode='train') if do_eval: eval_dataset = runner.val_dataloader.dataset self.log(runner, eval_dataset, mode='eval') - if do_test: - test_dataset = runner.test_dataloader.dataset - self.log(runner, test_dataset, mode='test') def before_val(self, runner) -> None: eval_dataset = runner.val_dataloader.dataset diff --git a/xtuner/model/__init__.py 
b/xtuner/model/__init__.py index e7d37e8c3..6ba85bb12 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -3,5 +3,6 @@ from .sft import SupervisedFinetune from .anyres_llava import AnyResLLaVAModel from .mini_gemini import MiniGeminiModel +from .internvl_1_5_llava import InternVL_v1_5_LLaVAModel -__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel', 'InternVL_v1_5_LLaVAModel'] diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py new file mode 100644 index 000000000..80935ae05 --- /dev/null +++ b/xtuner/model/internvl_1_5_llava.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .llava import LLaVAModel + + +from xtuner.registry import BUILDER +from .modules import ProjectorConfig, ProjectorModel, dispatch_modules +from .utils import (LoadWoInit, guess_load_checkpoint, + make_inputs_require_grad, + prepare_inputs_labels_for_multimodal) + + +class InternVL_v1_5_LLaVAModel(LLaVAModel): + def __init__(self, llm, + visual_encoder, + freeze_llm=False, + freeze_visual_encoder=False, + visual_select_layer=-2, + pretrained_pth=None, + projector_depth=2, + llm_lora=None, + visual_encoder_lora=None, + use_activation_checkpointing=True, + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None, + merge_type='pixel_shuffle', # or pixel_shuffle + downsample_ratio=0.5): + super(LLaVAModel, self).__init__() + self.downsample_ratio = downsample_ratio + + self.freeze_llm = freeze_llm + self.freeze_visual_encoder = freeze_visual_encoder + self.merge_type = merge_type + with LoadWoInit(): + if isinstance(llm, dict): + llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) + + self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( + visual_encoder) + self.llm.config.use_cache = False + dispatch_modules(self.llm) + + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size * (int(1 / self.downsample_ratio) ** 2), + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + self.projector = ProjectorModel(projector_config).to( + self.visual_encoder.dtype) + + if self.freeze_llm: + self.llm.requires_grad_(False) + if self.freeze_visual_encoder: + self.visual_encoder.requires_grad_(False) + + self.use_activation_checkpointing = use_activation_checkpointing + if use_activation_checkpointing: + # For backward compatibility + if hasattr(self.llm, 'enable_input_require_grads'): + self.llm.enable_input_require_grads() + else: + self.llm.get_input_embeddings().register_forward_hook( + make_inputs_require_grad) + if hasattr(self.visual_encoder, 'enable_input_require_grads'): + self.visual_encoder.enable_input_require_grads() + else: + self.visual_encoder.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + self.projector.enable_input_require_grads() + + # enable gradient (activation) checkpointing for memory efficiency + self.gradient_checkpointing_enable() + + self.use_llm_lora = llm_lora is not None + self.use_visual_encoder_lora = visual_encoder_lora is not None + + if self.use_llm_lora: + self._prepare_llm_for_lora(llm_lora, use_activation_checkpointing) + if self.use_visual_encoder_lora: + self._prepare_visual_encoder_for_lora( + visual_encoder_lora, use_activation_checkpointing) + + if pretrained_pth is not None: + pretrained_state_dict = 
guess_load_checkpoint(pretrained_pth)
+
+            self.load_state_dict(pretrained_state_dict, strict=False)
+            print(f'Load pretrained weight from {pretrained_pth}')
+
+        self.visual_select_layer = visual_select_layer
+
+        self._is_init = True
+
+        self.tokenizer = tokenizer
+        if tokenizer is not None:
+            self.tokenizer = BUILDER.build(tokenizer)
+        self.image_processor = image_processor
+        if image_processor is not None:
+            self.image_processor = BUILDER.build(image_processor)
+        self.template = template
+
+    def _prepare_data_for_llm(self, data):
+        if 'pixel_values' in data:
+            new_image_feature = self.__preprocess_for_pixel_values(data)
+            data['pixel_values'] = new_image_feature
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+
+    def __preprocess_for_pixel_values(self, data):
+        pixel_values = data['pixel_values']
+
+        if pixel_values.ndim == 5:
+            pixel_values = [x if x.ndim == 4 else x.unsqueeze(0) for x in pixel_values]
+
+        assert isinstance(pixel_values, list)
+        new_image_feature = []
+        for bs in range(len(pixel_values)):
+            # Encoding samples one by one like this saves a bit of GPU memory, though it is a bit slower.
+            # n, c, h, w
+            visual_outputs = self.visual_encoder(
+                pixel_values[bs].to(self.visual_encoder.dtype), output_hidden_states=True)
+
+            if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel':
+                vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+            elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
+                vit_embeds = visual_outputs.hidden_states[self.visual_select_layer]
+            else:
+                raise NotImplementedError
+            # n, hw, c
+            h = w = int(vit_embeds.shape[1] ** 0.5)
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+            vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+            # n,h'w',c'
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+
+            vit_embeds = self.projector(vit_embeds)
+            new_image_feature.append(vit_embeds)
+        return new_image_feature
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, h, w, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
+                   int(c / (scale_factor * scale_factor)))
+        x = x.permute(0, 2, 1, 3).contiguous()
+        return x
From 6f8d2fb4a586a1899c2c369df31de7dbd1e80d77 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Sun, 28 Apr 2024 19:10:44 +0800
Subject: [PATCH 090/126] add internvl 1.5 finetune

---
 .../phi3_internvl_1-5_finetune.py             | 455 ++++++++++++++++++
 xtuner/dataset/__init__.py                    |   4 +-
 .../internvl_v1_5_llava_proxy_eval_dataset.py |  97 ++++
 xtuner/dataset/llava.py                       |   4 +-
 xtuner/model/internvl_1_5_llava.py            |  90 +++-
 5 files changed, 622 insertions(+), 28 deletions(-)
 create mode 100644 xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py
 create mode 100644 xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py

diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py
new file mode 100644
index 000000000..4c2ca112c
--- /dev/null
+++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py
@@ -0,0 +1,455 @@
+# Copyright (c) OpenMMLab. All rights reserved.
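Before the InternVL-1.5 finetune config below, a short sanity check of the token bookkeeping introduced above: pixel_shuffle with downsample_ratio=0.5 merges the 24x24 grid of 1024-d CLIP-L/14-336 patch embeddings into a 12x12 grid of 4096-d tokens, which is why the projector input width is hidden_size * (int(1 / downsample_ratio) ** 2). The function below copies the method above purely as a standalone shape check; the tile count and example image size are arbitrary.

```python
import torch


def pixel_shuffle(x, scale_factor=0.5):
    # Same transform as InternVL_v1_5_LLaVAModel.pixel_shuffle above,
    # reproduced here only to check shapes outside the model.
    n, h, w, c = x.size()
    x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(h * scale_factor), int(w * scale_factor),
               int(c / (scale_factor * scale_factor)))
    return x.permute(0, 2, 1, 3).contiguous()


vit_embeds = torch.randn(2, 24, 24, 1024)    # 2 tiles x 576 CLIP-L/14-336 patch tokens
merged = pixel_shuffle(vit_embeds, scale_factor=0.5)
print(merged.shape)                          # torch.Size([2, 12, 12, 4096])
print(merged.reshape(2, -1, 4096).shape[1])  # 144 merged tokens per tile for the projector

# With min_num=1, max_num=6 and image_size=336 (the settings used in these
# configs), a 1000x750 image maps to a 3x2 tile grid plus a thumbnail, i.e.
# 7 tiles and 7 * 144 = 1008 visual tokens after the merge.
```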
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset, InternVL_v1_5_LLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/phi3_internvl_1-5_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + 
type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='./phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +eval_num = 6 + +proxy_eval_dataset = dict(type=InternVL_v1_5_LLaVAProxyEvalDataset, min_num=eval_num, max_num=eval_num) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + 
image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index 4dddf7bd1..4964520fc 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -18,6 +18,7 @@ from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset from .mini_gemini_dataset import MiniGeminiDataset from .mini_gemini_proxy_eval_dataset import MiniGeminiProxyEvalDataset +from .internvl_v1_5_llava_proxy_eval_dataset import InternVL_v1_5_LLaVAProxyEvalDataset # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -38,5 +39,6 @@ 'AnyResLLaVAProxyEvalDataset', 'MiniGeminiDataset', 'MiniGeminiProxyEvalDataset', - 'InternVL_V1_5_LLaVADataset' + 'InternVL_V1_5_LLaVADataset', + 'InternVL_v1_5_LLaVAProxyEvalDataset' ] diff --git a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py new file mode 100644 index 000000000..0c180ab83 --- /dev/null +++ b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py @@ -0,0 +1,97 @@ +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +from .utils import dynamic_preprocess + + +class InternVL_v1_5_LLaVAProxyEvalDataset: + def __init__(self, eval_dataset, min_num, max_num): + self.eval_ds = eval_dataset + self.min_num = min_num + self.max_num = max_num + + # TODO: Assuming they are all squares. 
+ if hasattr(eval_dataset.image_processor, 'crop_size'): + self._crop_size = eval_dataset.image_processor.crop_size + else: + self._crop_size = eval_dataset.image_processor.size + self._image_size = self._crop_size['height'] + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + # TODO prompt are different of vlmevalkit + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) + for i, image in enumerate(images): + image = self.eval_ds.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + images[i] = image + images = torch.stack(images, dim=0) + data_dict['pixel_values'] = images + return data_dict diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index d44e336fb..a55733e4c 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -213,10 +213,10 @@ def __getitem__(self, index): image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) - for image in images: + for i, image in enumerate(images): image = self.image_processor.preprocess( image, return_tensors='pt')['pixel_values'][0] - images.append(image) + images[i] = image images = torch.stack(images, dim=0) 
data_dict['pixel_values'] = images
         else:

diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py
index 80935ae05..24033dfd0 100644
--- a/xtuner/model/internvl_1_5_llava.py
+++ b/xtuner/model/internvl_1_5_llava.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from .llava import LLaVAModel
-
+import torch
 
 from xtuner.registry import BUILDER
 from .modules import ProjectorConfig, ProjectorModel, dispatch_modules
@@ -110,32 +110,72 @@ def _prepare_data_for_llm(self, data):
 
     def __preprocess_for_pixel_values(self, data):
         pixel_values = data['pixel_values']
 
-        if pixel_values.ndim == 5:
-            pixel_values = [x if x.ndim == 4 else x.unsqueeze(0) for x in pixel_values]
+        if type(pixel_values) is list or pixel_values.ndim == 5:
+            if type(pixel_values) is list:
+                pixel_values = [
+                    x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
+                ]
+            # b*n, c, h, w
+            concat_images = torch.cat([image.to(self.visual_encoder.dtype) for image in pixel_values], dim=0)
+        else:
+            raise NotImplementedError()
+
+        # b*n, hw, d
+        visual_outputs = self.visual_encoder(concat_images, output_hidden_states=True)
+
+        if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel':
+            vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+        elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
+            vit_embeds = visual_outputs.hidden_states[self.visual_select_layer]
+        else:
+            raise NotImplementedError
+
+        # n, hw, c
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        # n,h'w',c'
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+
+        vit_embeds = self.projector(vit_embeds)
+
+        split_sizes = [image.shape[0] for image in pixel_values]
+        image_features = torch.split(vit_embeds, split_sizes, dim=0)
 
-        assert isinstance(pixel_values, list)
         new_image_feature = []
-        for bs in range(len(pixel_values)):
-            # This saves a bit of GPU memory, although it is a bit slower.
-            # n, c, h, w
-            visual_outputs = self.visual_encoder(
-                pixel_values[bs].to(self.visual_encoder.dtype), output_hidden_states=True)
-
-            if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel':
-                vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
-            elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
-                vit_embeds = visual_outputs.hidden_states[self.visual_select_layer]
-            else:
-                raise NotImplementedError
-            # n, hw, c
-            h = w = int(vit_embeds.shape[1] ** 0.5)
-            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-            vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
-            # n,h'w',c'
-            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
-
-            vit_embeds = self.projector(vit_embeds)
-            new_image_feature.append(vit_embeds)
+        for image_feature in image_features:
+            B, N, C = image_feature.shape
+            image_feature = image_feature.reshape(B * N, C)
+            new_image_feature.append(image_feature)
+
+        # TODO: this per-sample for-loop version cannot be used with ZeRO + activation checkpointing
+        # if isinstance(pixel_values, torch.Tensor) and pixel_values.ndim == 5:
+        #     pixel_values = [x if x.ndim == 4 else x.unsqueeze(0) for x in pixel_values]
+        #     assert isinstance(pixel_values, list)
+
+        # for bs in range(len(pixel_values)):
+        #     # This saves a bit of GPU memory, although it is a bit slower.
+        #     # n, c, h, w
+        #     visual_outputs = self.visual_encoder(
+        #         pixel_values[bs].to(self.visual_encoder.dtype), output_hidden_states=True)
+        #
+        #     if
self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + # vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] + # elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel': + # vit_embeds = visual_outputs.hidden_states[self.visual_select_layer] + # else: + # raise NotImplementedError + # # n, hw, c + # h = w = int(vit_embeds.shape[1] ** 0.5) + # vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + # vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) + # # n,h'w',c' + # vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) + # + # vit_embeds = self.projector(vit_embeds) + # B, N, C = vit_embeds.shape + # vit_embeds = vit_embeds.reshape(B * N, C) + # new_image_feature.append(vit_embeds) return new_image_feature def pixel_shuffle(self, x, scale_factor=0.5): From ab0b0037738ed4112e865ec8612932f3dbc38e12 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 28 Apr 2024 20:53:22 +0800 Subject: [PATCH 091/126] update --- calc_image_size_offline.py | 56 +++++++++++++++++++ .../phi3_internvl_1-5_finetune.py | 3 +- .../phi3_internvl_1-5_pretrain.py | 2 +- xtuner/dataset/llava.py | 41 ++++++++++---- 4 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 calc_image_size_offline.py diff --git a/calc_image_size_offline.py b/calc_image_size_offline.py new file mode 100644 index 000000000..67686e7e7 --- /dev/null +++ b/calc_image_size_offline.py @@ -0,0 +1,56 @@ +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +import json +from PIL import Image +import os + +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + + +def load_jsonl(json_file): + with open(json_file) as f: + lines = f.readlines() + data = [] + for line in lines: + data.append(json.loads(line)) + return data + + +def calc_fn(data_dict): + size = {'width': 0, 'height': 0, 'image': 'None'} + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(image_folder, + image_file)) + size['image'] = image_file + size['width'] = image.size[0] + size['height'] = image.size[1] + return size + + +if __name__ == '__main__': + print('start calculating modality length') + if data_path.endswith('.json'): + json_data = json.load(open(data_path)) + elif data_path.endswith('.jsonl'): + json_data = load_jsonl(data_path) + else: + raise NotImplementedError + + with ThreadPoolExecutor(max_workers=8) as executor: + length_list = list( + tqdm( + executor.map(calc_fn, json_data), + desc='Calculating modality length', + total=len(json_data))) + print('end calculating modality length') + + new_output_dict = {} + for i in range(len(length_list)): + if length_list[i]['image'] != 'None': + new_output_dict[length_list[i]['image']] = [length_list[i]['width'], length_list[i]['height']] + + with open('image_size.json', 'w') as f: + json.dump(new_output_dict, f) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py index 4c2ca112c..d22e2be0a 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py @@ -104,7 +104,8 @@ min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder='./phi3_mini_llava_finetune', + 
offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + image_size_json=None, data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py index 653e48024..c92e3cecf 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py @@ -94,7 +94,7 @@ min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder='./phi3_mini_llava_pretrain', + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_pretrain', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index a55733e4c..bf8c90aeb 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -14,6 +14,8 @@ from xtuner.registry import BUILDER from .huggingface import process_hf_dataset from .utils import expand2square, process_anyres_image, total_image_token, dynamic_preprocess +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm def load_jsonl(json_file): @@ -171,7 +173,7 @@ def __getitem__(self, index): class InternVL_V1_5_LLaVADataset(LLaVADataset): - def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, *args, **kwargs): + def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, image_size_json=None, *args, **kwargs): self.min_num = min_num self.max_num = max_num self.downsample_ratio = downsample_ratio @@ -188,21 +190,38 @@ def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, *args self._image_size = image_size self._patch_size = (self._image_size // 14) * downsample_ratio # 12 - @property - def modality_length(self): - print_log('start calculating modality length', logger='current'), - length_list = [] - for data_dict in self.text_data: + self.image_size_json = None + if image_size_json is not None: + with open(image_size_json, 'r') as f: + self.image_size_json = json.load(f) + + def __calc_fn(self, data_dict): + cur_len = len(data_dict['input_ids']) + if data_dict.get('image', None) is not None: cur_len = len(data_dict['input_ids']) - if data_dict.get('image', None) is None: + if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = Image.open(os.path.join(self.image_folder, - image_file)) - num_image_token = total_image_token(image.size, self.min_num, self.max_num, self._image_size, + if self.image_size_json is not None: + size = self.image_size_json[image_file] + else: + image = Image.open(os.path.join(self.image_folder, + image_file)) + size = image.size + num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, self._patch_size) cur_len += num_image_token cur_len = -cur_len - length_list.append(cur_len) + return cur_len + + @property + def modality_length(self): + print_log('start calculating modality length', logger='current'), + with ThreadPoolExecutor(max_workers=8) as executor: + length_list = list( + tqdm( + executor.map(self.__calc_fn, self.text_data), + desc='Calculating modality length', + total=len(self.text_data))) print_log('end calculating modality length', logger='current'), return length_list From f47d06d83ba446a13e5445d536b1b0b683417261 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sun, 28 Apr 2024 21:09:33 +0800 Subject: [PATCH 092/126] 
update --- xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py index 0c180ab83..3c733ba43 100644 --- a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py +++ b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py @@ -56,12 +56,6 @@ def getitem(self, idx, data): inputs = '' inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) - if self.eval_ds.use_system: - inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') - else: - inputs = '' - inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) - # 2 tokenize inputs chunk_encode = [] for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): From 323dfbb1a7e0a61e34920f82ce2621694d1e507a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 29 Apr 2024 15:19:58 +0800 Subject: [PATCH 093/126] add layer-wise learning rate (LLDR) --- ...vit_large_p14_336_lldr_e1_gpu8_finetune.py | 428 ++++++++++++++++++ xtuner/engine/optimizers/__init__.py | 6 + .../layer_decay_optim_wrapper_constructor.py | 156 +++++++ xtuner/engine/optimizers/utils.py | 35 ++ xtuner/model/llava.py | 27 +- 5 files changed, 648 insertions(+), 4 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py create mode 100644 xtuner/engine/optimizers/__init__.py create mode 100644 xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py create mode 100644 xtuner/engine/optimizers/utils.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py new file mode 100644 index 000000000..a17ca3587 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py @@ -0,0 +1,428 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
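(For intuition: a small sketch, independent of the patch, of the layer-wise learning-rate decay (LLDR) rule applied by the LearningRateDecayOptimWrapperConstructor added later in this patch. A parameter group at depth layer_id gets lr = base_lr * layer_decay_rate ** (num_layers - layer_id - 1), so shallower ViT layers train with much smaller learning rates. The numbers below assume this config's lr=2e-5 and layer_decay_rate=0.75 with a CLIP ViT-L encoder, for which get_layer_depth_for_CLIPVisionModel reports num_layers = 24 + 2.)

base_lr = 2e-5
decay_rate = 0.75
num_layers = 24 + 2  # patch embedding / pre-norm (depth 0) + 24 encoder layers + everything else (depth 25)

for layer_id in (0, 1, 12, 24, 25):
    scale = decay_rate ** (num_layers - layer_id - 1)
    print(f'depth {layer_id:2d}: lr_scale = {scale:.4f}, lr = {base_lr * scale:.2e}')
# Depth 25 (projector / LLM side) keeps the full base lr; depth 0 gets roughly 7.5e-4 of it.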
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.engine.optimizers import LearningRateDecayOptimWrapperConstructor, get_layer_depth_for_CLIPVisionModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + 
visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor=LearningRateDecayOptimWrapperConstructor, # ==================== + paramwise_cfg=dict(layer_decay_rate=0.75), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # 
dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/engine/optimizers/__init__.py b/xtuner/engine/optimizers/__init__.py new file mode 100644 index 000000000..9372c9ff7 --- /dev/null +++ b/xtuner/engine/optimizers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optim_wrapper_constructor import LearningRateDecayOptimWrapperConstructor +from .utils import get_layer_depth_for_CLIPVisionModel +__all__ = [ + 'LearningRateDecayOptimWrapperConstructor', 'get_layer_depth_for_CLIPVisionModel' +] diff --git a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py new file mode 100644 index 000000000..8cdd23b7f --- /dev/null +++ b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import Callable, List, Optional + +from mmengine.logging import MMLogger +from mmengine.optim import DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm +from torch import nn +from torch.nn import GroupNorm, LayerNorm + + +class LearningRateDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor): + """Different learning rates are set for different layers of backbone. + + By default, each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. + It is a dict and may contain the following fields: + + - ``layer_decay_rate`` (float): The learning rate of a parameter will + multiply it by multiple times according to the layer depth of the + parameter. Usually, it's less than 1, so that the earlier layers will + have a lower learning rate. Defaults to 1. + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in normalization layers). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization layers. + - ``flat_decay_mult`` (float): It will be multiplied to the weight + decay for all one-dimensional parameters + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_decay_mult`` will be + ignored. 
It should be a dict and may contain fields ``decay_mult``. + (The ``lr_mult`` is disabled in this constructor). + + Example: + + In the config file, you can use this constructor as below: + + .. code:: python + + optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=4e-3, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + constructor='LearningRateDecayOptimWrapperConstructor', + paramwise_cfg=dict( + layer_decay_rate=0.75, # layer-wise lr decay factor + norm_decay_mult=0., + flat_decay_mult=0., + custom_keys={ + '.cls_token': dict(decay_mult=0.0), + '.pos_embed': dict(decay_mult=0.0) + })) + """ + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = '', + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (List[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + optimizer_cfg (dict): The configuration of optimizer. + prefix (str): The prefix of the module. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + logger = MMLogger.get_current_instance() + + assert hasattr(module, 'get_layer_depth'), 'The model should have `get_layer_depth` method' + get_layer_depth = module.get_layer_depth + + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + decay_rate = self.paramwise_cfg.get('layer_decay_rate', 1.0) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + param_name = prefix + name + if not param.requires_grad: + continue + + if self.base_wd is not None: + base_wd = self.base_wd + custom_key = next( + filter(lambda k: k in param_name, sorted_keys), None) + # custom parameters decay + if custom_key is not None: + custom_cfg = custom_keys[custom_key].copy() + decay_mult = custom_cfg.pop('decay_mult', 1.) + + param_group['weight_decay'] = base_wd * decay_mult + # add custom settings to param_group + param_group.update(custom_cfg) + # norm decay + elif is_norm and norm_decay_mult is not None: + param_group['weight_decay'] = base_wd * norm_decay_mult + # bias decay + elif name == 'bias' and bias_decay_mult is not None: + param_group['weight_decay'] = base_wd * bias_decay_mult + # flatten parameters decay + elif param.ndim == 1 and flat_decay_mult is not None: + param_group['weight_decay'] = base_wd * flat_decay_mult + else: + param_group['weight_decay'] = base_wd + + layer_id, max_id = get_layer_depth(param_name) + scale = decay_rate**(max_id - layer_id - 1) + param_group['lr'] = self.base_lr * scale + param_group['lr_scale'] = scale + param_group['layer_id'] = layer_id + param_group['param_name'] = param_name + + params.append(param_group) + + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}{child_name}.' 
+ self.add_params( + params, + child_mod, + prefix=child_prefix, + get_layer_depth=get_layer_depth, + ) + + if prefix == '': + layer_params = defaultdict(list) + for param in params: + layer_params[param['layer_id']].append(param) + for layer_id, layer_params in layer_params.items(): + lr_scale = layer_params[0]['lr_scale'] + lr = layer_params[0]['lr'] + msg = [ + f'layer {layer_id} params ' + f'(lr={lr:.3g}, lr_scale={lr_scale:.3g}):' + ] + for param in layer_params: + msg.append(f'\t{param["param_name"]}: ' + f'weight_decay={param["weight_decay"]:.3g}') + logger.debug('\n'.join(msg)) diff --git a/xtuner/engine/optimizers/utils.py b/xtuner/engine/optimizers/utils.py new file mode 100644 index 000000000..d1279c1a8 --- /dev/null +++ b/xtuner/engine/optimizers/utils.py @@ -0,0 +1,35 @@ + +def get_layer_depth_for_CLIPVisionModel(self, param_name: str, prefix: str = 'vision_model.'): + """Get the layer-wise depth of a parameter. + + Args: + param_name (str): The name of the parameter. + prefix (str): The prefix for the parameter. + Defaults to an empty string. + + Returns: + Tuple[int, int]: The layer-wise depth and the num of layers. + + Note: + The first depth is the stem module (``layer_depth=0``), and the + last depth is the subsequent module (``layer_depth=num_layers-1``) + """ + num_layers = self.config.num_hidden_layers + 2 + + if not param_name.startswith(prefix): + # For subsequent module like head + return num_layers - 1, num_layers + + param_name = param_name[len(prefix):] + + if param_name.startswith('embeddings'): + layer_depth = 0 + elif param_name.startswith('pre_layrnorm'): + layer_depth = 0 + elif param_name.startswith('encoder.layers'): + layer_id = int(param_name.replace('encoder.', '').split('.')[1]) + layer_depth = layer_id + 1 + else: + layer_depth = num_layers - 1 + + return layer_depth, num_layers diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index c7a50e648..75e20f590 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -22,6 +22,8 @@ StopWordStoppingCriteria) from functools import reduce from mmengine.logging import print_log +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel + class LLaVAModel(BaseModel): @@ -41,7 +43,9 @@ def __init__(self, max_position_embeddings=None, image_processor=None, tokenizer=None, - template=None): + template=None, + use_lldr=False, # LearningRateDecayOptimWrapperConstructor + ): super().__init__() self.s2_scales = s2_scales self.freeze_llm = freeze_llm @@ -51,12 +55,20 @@ def __init__(self, llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( visual_encoder) + + if use_lldr: + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + visual_encoder_clazz = visual_encoder['type'] + visual_encoder_clazz.get_layer_depth = get_layer_depth_for_CLIPVisionModel + self.llm.config.use_cache = False dispatch_modules(self.llm) - assert int(token_merge_ratio**0.5)**2 == token_merge_ratio, \ + assert int(token_merge_ratio ** 0.5) ** 2 == token_merge_ratio, \ '`token_merge_ratio` must be a square number.' self.token_merge_ratio = int(token_merge_ratio) @@ -134,6 +146,13 @@ def __init__(self, self.template = template print_log(self, logger='current') + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. 
Otherwise, it will be ignored. + def get_layer_depth(self): + assert hasattr(self.visual_encoder, 'get_layer_depth'), \ + 'The visual_encoder does not have `get_layer_depth` method.' + return self.visual_encoder.get_layer_depth + def _parse_lora_config(self, lora_config): if isinstance(lora_config, dict) or isinstance( lora_config, Config) or isinstance(lora_config, ConfigDict): @@ -292,10 +311,10 @@ def _merge_tokens(tokens, token_merge_ratio): # B, W // w_r, H, C * w_r tokens = tokens.permute(0, 2, 1, 3).contiguous() # B, W // w_r, H // h_r, C * w_r * h_r - tokens = tokens.view(b, w // w_ratio, h // h_ratio, + tokens = tokens.view(b, w // w_ratio, h // h_ratio, c * w_ratio * h_ratio) # B, W * H // w_r // h_r, C * w_r * h_r - tokens = tokens.view(b, w * h // w_ratio // h_ratio, + tokens = tokens.view(b, w * h // w_ratio // h_ratio, c * w_ratio * h_ratio) return tokens From e605a7336a34eb0044dbe74f303346ac5a691262 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 29 Apr 2024 15:26:52 +0800 Subject: [PATCH 094/126] update config --- .../llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py index d22e2be0a..db7ee4beb 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py @@ -49,11 +49,11 @@ warmup_ratio = 0.03 # Save -save_steps = 1500 +save_steps = 2000 save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training -evaluation_freq = 1500 +evaluation_freq = 2000 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] @@ -105,7 +105,7 @@ max_num=max_num, downsample_ratio=downsample_ratio, offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', - image_size_json=None, + image_size_json='/mnt/petrelfs/huanghaian/code/mm/xtuner/image_size.json', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, From ed1a8362bf081ea95e6e3a7d0879d01fc7af16b5 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 29 Apr 2024 15:30:19 +0800 Subject: [PATCH 095/126] fix --- ...ull_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py index a17ca3587..50004c1cc 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py @@ -19,7 +19,7 @@ from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler -from xtuner.engine.optimizers import LearningRateDecayOptimWrapperConstructor, get_layer_depth_for_CLIPVisionModel +from xtuner.engine.optimizers import LearningRateDecayOptimWrapperConstructor ####################################################################### # PART 1 Settings # @@ -80,7 +80,7 @@ template=prompt_template, 
image_processor=image_processor, freeze_llm=False, - freeze_visual_encoder=True, + freeze_visual_encoder=False, pretrained_pth=pretrained_pth, llm=dict( type=AutoModelForCausalLM.from_pretrained, @@ -88,9 +88,7 @@ trust_remote_code=True), visual_encoder=dict( type=CLIPVisionModel.from_pretrained, - pretrained_model_name_or_path=visual_encoder_name_or_path), - visual_encoder_lora=dict( - type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') + pretrained_model_name_or_path=visual_encoder_name_or_path) ) ####################################################################### @@ -98,7 +96,7 @@ ####################################################################### llava_dataset = dict( type=LLaVADataset, - offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_pretrain', + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', data_path=data_path, image_folder=image_folder, tokenizer=tokenizer, From f5a1922c5031fa1d54e882841b3061b4ce9255ab Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 29 Apr 2024 17:04:49 +0800 Subject: [PATCH 096/126] update --- ...ruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py | 5 ++--- xtuner/engine/__init__.py | 3 ++- .../optimizers/layer_decay_optim_wrapper_constructor.py | 2 ++ xtuner/model/llava.py | 5 +++-- xtuner/tools/train.py | 4 +++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py index 50004c1cc..20f6494e1 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py @@ -19,7 +19,6 @@ from xtuner.dataset import ConcatDataset from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop from mmengine.dataset import DefaultSampler -from xtuner.engine.optimizers import LearningRateDecayOptimWrapperConstructor ####################################################################### # PART 1 Settings # @@ -28,7 +27,7 @@ llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' # Specify the pretrained pth -pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 # Data data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' @@ -128,7 +127,7 @@ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), accumulative_counts=accumulative_counts, - constructor=LearningRateDecayOptimWrapperConstructor, # ==================== + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== paramwise_cfg=dict(layer_decay_rate=0.75), # vit-l loss_scale='dynamic', dtype='float16') diff --git a/xtuner/engine/__init__.py b/xtuner/engine/__init__.py index ae4a46726..18fd1d3c8 100644 --- 
a/xtuner/engine/__init__.py +++ b/xtuner/engine/__init__.py @@ -3,9 +3,10 @@ from .hooks import (DatasetInfoHook, EvaluateChatHook, ThroughputHook, VarlenAttnArgsToMessageHubHook) from .runner import TrainLoop, ValLoop, TestLoop +from .optimizers import LearningRateDecayOptimWrapperConstructor __all__ = [ 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook', 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop', - 'ValLoop', 'TestLoop' + 'ValLoop', 'TestLoop', 'LearningRateDecayOptimWrapperConstructor' ] diff --git a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py index 8cdd23b7f..f338acd80 100644 --- a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py +++ b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py @@ -7,8 +7,10 @@ from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm from torch import nn from torch.nn import GroupNorm, LayerNorm +from xtuner.registry import BUILDER +@BUILDER.register_module() class LearningRateDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor): """Different learning rates are set for different layers of backbone. diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 75e20f590..6525c2801 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -23,6 +23,7 @@ from functools import reduce from mmengine.logging import print_log from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel +import types class LLaVAModel(BaseModel): @@ -62,8 +63,8 @@ def __init__(self, if use_lldr: # The following code is only meaningful when the optim_wrapper configuration # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. 
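# ---------------------------------------------------------------------------
# Illustrative aside (not part of the diff): how the layer-wise lr decay
# pieces fit together. `get_layer_depth_for_CLIPVisionModel` maps a parameter
# name to a depth bucket, and `LearningRateDecayOptimWrapperConstructor`
# scales the base lr by layer_decay_rate ** (num_layers - depth - 1).
# (`types.MethodType` in the hunk below attaches that helper to the
# CLIPVisionModel instance so that `LLaVAModel.get_layer_depth` can delegate
# to it.) The concrete names, the 24-layer ViT-L/14 and the 0.75 decay rate
# here are example values, not taken from the patch.
base_lr, decay_rate = 2e-5, 0.75
num_layers = 24 + 2  # stem bucket + 24 encoder layers + trailing bucket

def depth_of(param_name, prefix='vision_model.'):
    """Simplified mirror of get_layer_depth_for_CLIPVisionModel."""
    if not param_name.startswith(prefix):
        return num_layers - 1            # e.g. LLM / projector params: full lr
    name = param_name[len(prefix):]
    if name.startswith(('embeddings', 'pre_layrnorm')):
        return 0                          # stem: most strongly decayed
    if name.startswith('encoder.layers'):
        return int(name.split('.')[2]) + 1
    return num_layers - 1                 # post_layernorm and friends

for n in ('vision_model.embeddings.patch_embedding.weight',
          'vision_model.encoder.layers.0.mlp.fc1.weight',
          'vision_model.encoder.layers.23.mlp.fc1.weight',
          'vision_model.post_layernorm.weight'):
    depth = depth_of(n)
    scale = decay_rate ** (num_layers - depth - 1)
    print(f'{n}: depth={depth}, lr={base_lr * scale:.2e}')
# ---------------------------------------------------------------------------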
- visual_encoder_clazz = visual_encoder['type'] - visual_encoder_clazz.get_layer_depth = get_layer_depth_for_CLIPVisionModel + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) diff --git a/xtuner/tools/train.py b/xtuner/tools/train.py index 23e3d2a3f..c661c2078 100644 --- a/xtuner/tools/train.py +++ b/xtuner/tools/train.py @@ -316,7 +316,9 @@ def main(): cfg.__setitem__('strategy', strategy) optim_wrapper = dict( type='DeepSpeedOptimWrapper', - optimizer=cfg.optim_wrapper.optimizer) + optimizer=cfg.optim_wrapper.optimizer, + constructor=cfg.optim_wrapper.get('constructor', None), + paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) cfg.__setitem__('optim_wrapper', optim_wrapper) cfg.runner_type = 'FlexibleRunner' From cfd8d4d0e0668bc78f1eafa645a02fd1a59b262e Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 29 Apr 2024 18:05:45 +0800 Subject: [PATCH 097/126] fix --- .../layer_decay_optim_wrapper_constructor.py | 16 +++++++++++----- xtuner/model/llava.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py index f338acd80..707ea4556 100644 --- a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py +++ b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py @@ -7,10 +7,10 @@ from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm from torch import nn from torch.nn import GroupNorm, LayerNorm -from xtuner.registry import BUILDER +from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS -@BUILDER.register_module() +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() class LearningRateDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor): """Different learning rates are set for different layers of backbone. @@ -62,6 +62,7 @@ def add_params(self, params: List[dict], module: nn.Module, prefix: str = '', + get_layer_depth: Optional[Callable] = None, **kwargs) -> None: """Add all parameters of module to the params list. 
@@ -81,9 +82,6 @@ def add_params(self, sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) logger = MMLogger.get_current_instance() - assert hasattr(module, 'get_layer_depth'), 'The model should have `get_layer_depth` method' - get_layer_depth = module.get_layer_depth - bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) @@ -93,6 +91,14 @@ def add_params(self, is_norm = isinstance(module, (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + # The model should have `get_layer_depth` method + if get_layer_depth is None and not hasattr(module, 'get_layer_depth'): + raise NotImplementedError('The layer-wise learning rate decay need' + f' the model {type(module)} has' + ' `get_layer_depth` method.') + else: + get_layer_depth = get_layer_depth or module.get_layer_depth + for name, param in module.named_parameters(recurse=False): param_group = {'params': [param]} param_name = prefix + name diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 6525c2801..67c8d8564 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -149,10 +149,10 @@ def __init__(self, # The following code is only meaningful when the optim_wrapper configuration # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. - def get_layer_depth(self): + def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): assert hasattr(self.visual_encoder, 'get_layer_depth'), \ 'The visual_encoder does not have `get_layer_depth` method.' - return self.visual_encoder.get_layer_depth + return self.visual_encoder.get_layer_depth(param_name, prefix) def _parse_lora_config(self, lora_config): if isinstance(lora_config, dict) or isinstance( From 98e6ac9cd74bdfe0284228954e6acb9430f56b93 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 6 May 2024 09:54:14 +0800 Subject: [PATCH 098/126] update config --- ...vit_large_p14_336_lldr_e1_gpu8_finetune.py | 1 + .../llava_phi3_mini_4k_more_data.py | 518 +++++++++++++++++ .../phi3_internvl_1-5_more_data.py | 532 ++++++++++++++++++ 3 files changed, 1051 insertions(+) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py create mode 100644 xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py index 20f6494e1..1906935dc 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py @@ -182,6 +182,7 @@ # save checkpoint per `save_steps`. checkpoint=dict( type=CheckpointHook, + save_optimizer=False, by_epoch=False, interval=save_steps, max_keep_ckpts=save_total_limit), diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py new file mode 100644 index 000000000..2a2eaca10 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py @@ -0,0 +1,518 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
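+# Rough effect of the layer-wise lr decay used further down in this config
+# (constructor='LearningRateDecayOptimWrapperConstructor' with
+# paramwise_cfg=dict(layer_decay_rate=0.9) and a CLIP ViT-L/14 tower, i.e.
+# 26 depth buckets; numbers are approximate): the deepest encoder block trains
+# at 0.9 ** 1 = 0.9x the base lr, the shallowest at 0.9 ** 24 ~= 0.08x, the
+# patch/position embeddings at 0.9 ** 25 ~= 0.07x, while parameters outside
+# the visual encoder keep the full base lr.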
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + 
type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = True +# sharegpt4v_caption_dataset = dict( +# type=LLaVADataset, +# offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', +# data_path=sharegpt4v_caption_data_path, +# image_folder=sharegpt4v_caption_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=max_length, +# pad_image_to_square=pad_image_to_square) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + 
type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # 
for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using 
test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py new file mode 100644 index 000000000..e0a4ec3cf --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py @@ -0,0 +1,532 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset, InternVL_v1_5_LLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/phi3_internvl_1-5_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device 
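# ---------------------------------------------------------------------------
# Aside (not part of the diff): the arithmetic behind `max_length` above.
# It assumes a CLIP ViT-L/14 tower at 336 px; whether the multi-tile input
# (min_num/max_num) and the pixel-shuffle merge change the effective visual
# token budget is an assumption about this model, not something the config
# states.
patch_tokens = (336 // 14) ** 2          # 576 patch tokens per 336 px tile
text_budget = 2048 - patch_tokens        # max_length above -> 1472
merged_per_tile = int(patch_tokens * 0.5 ** 2)   # 144, if downsample_ratio=0.5
                                                 # folds 2x2 patches per token
print(patch_tokens, text_budget, merged_per_tile)  # 576 1472 144
# ---------------------------------------------------------------------------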
+accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = False + +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + 
max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = 
dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +eval_num = 6 + +proxy_eval_dataset = dict(type=InternVL_v1_5_LLaVAProxyEvalDataset, min_num=eval_num, max_num=eval_num) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + 
type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 1706628ab86b41a134451df12233a6c70c974ae1 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 7 May 2024 09:52:55 +0800 Subject: [PATCH 099/126] update config --- xtuner/dataset/llava.py | 22 ++++++------ xtuner/model/internvl_1_5_llava.py | 18 ++++++++++ xtuner/tools/calc_image_size.py | 56 ++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 xtuner/tools/calc_image_size.py diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index bf8c90aeb..e3ba70273 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -213,17 +213,17 @@ def __calc_fn(self, data_dict): cur_len = -cur_len return cur_len - @property - def modality_length(self): - print_log('start calculating modality length', logger='current'), - with ThreadPoolExecutor(max_workers=8) as executor: - length_list = list( - tqdm( - executor.map(self.__calc_fn, self.text_data), - desc='Calculating modality length', - total=len(self.text_data))) - print_log('end calculating modality length', logger='current'), - return length_list + # @property + # def modality_length(self): + # print_log('start calculating modality length', logger='current'), + # with ThreadPoolExecutor(max_workers=8) as executor: + # length_list = list( + # tqdm( + # executor.map(self.__calc_fn, self.text_data), + # desc='Calculating modality length', + # total=len(self.text_data))) + # print_log('end calculating modality length', logger='current'), + # return length_list def __getitem__(self, index): data_dict = self.text_data[index] diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py index 24033dfd0..59ac31830 100644 --- a/xtuner/model/internvl_1_5_llava.py +++ b/xtuner/model/internvl_1_5_llava.py @@ -9,6 +9,9 @@ make_inputs_require_grad, prepare_inputs_labels_for_multimodal) +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel +import types + class InternVL_v1_5_LLaVAModel(LLaVAModel): def __init__(self, llm, @@ -25,6 +28,7 @@ def __init__(self, llm, image_processor=None, tokenizer=None, template=None, + use_lldr=False, # LearningRateDecayOptimWrapperConstructor merge_type='pixel_shuffle', # or pixel_shuffle downsample_ratio=0.5): super(LLaVAModel, self).__init__() @@ -40,6 +44,13 @@ def 
__init__(self, llm, self.llm = self._build_from_cfg_or_module(llm) self.visual_encoder = self._build_from_cfg_or_module( visual_encoder) + + if use_lldr: + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, + self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) @@ -100,6 +111,13 @@ def __init__(self, llm, self.image_processor = BUILDER.build(image_processor) self.template = template + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): + assert hasattr(self.visual_encoder, 'get_layer_depth'), \ + 'The visual_encoder does not have `get_layer_depth` method.' + return self.visual_encoder.get_layer_depth(param_name, prefix) + def _prepare_data_for_llm(self, data): if 'pixel_values' in data: new_image_feature = self.__preprocess_for_pixel_values(data) diff --git a/xtuner/tools/calc_image_size.py b/xtuner/tools/calc_image_size.py new file mode 100644 index 000000000..1a2879976 --- /dev/null +++ b/xtuner/tools/calc_image_size.py @@ -0,0 +1,56 @@ +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +import json +from PIL import Image +import os + +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + + +def load_jsonl(json_file): + with open(json_file) as f: + lines = f.readlines() + data = [] + for line in lines: + data.append(json.loads(line)) + return data + + +def calc_fn(data_dict): + size = {'width': 0, 'height': 0, 'image': 'None'} + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(image_folder, + image_file)) + size['image'] = image_file + size['width'] = image.size[0] + size['height'] = image.size[1] + return size + + +if __name__ == '__main__': + print('start calculating modality length') + if data_path.endswith('.json'): + json_data = json.load(open(data_path)) + elif data_path.endswith('.jsonl'): + json_data = load_jsonl(data_path) + else: + raise NotImplementedError + + with ThreadPoolExecutor(max_workers=16) as executor: + length_list = list( + tqdm( + executor.map(calc_fn, json_data), + desc='Calculating modality length', + total=len(json_data))) + print('end calculating modality length') + + new_output_dict = {} + for i in range(len(length_list)): + if length_list[i]['image'] != 'None': + new_output_dict[length_list[i]['image']] = [length_list[i]['width'], length_list[i]['height']] + + with open('llava_v1_5_mix665k_image_size.json', 'w') as f: + json.dump(new_output_dict, f) From 38c8c278831c1b194138cd0d4ad234c807319189 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 7 May 2024 19:01:45 +0800 Subject: [PATCH 100/126] add test --- .../llava/llama3_70b_chat/test_config.py | 207 ++++++++++++++++++ xtuner/dataset/__init__.py | 2 +- xtuner/dataset/collate_fns/__init__.py | 2 +- xtuner/dataset/collate_fns/mm_collate_fn.py | 10 + xtuner/dataset/llava_proxy_eval_dataset1.py | 106 +++++++++ xtuner/model/__init__.py | 3 +- xtuner/model/openai.py | 63 ++++++ 7 files changed, 
390 insertions(+), 3 deletions(-) create mode 100644 xtuner/configs/llava/llama3_70b_chat/test_config.py create mode 100644 xtuner/dataset/llava_proxy_eval_dataset1.py create mode 100644 xtuner/model/openai.py diff --git a/xtuner/configs/llava/llama3_70b_chat/test_config.py b/xtuner/configs/llava/llama3_70b_chat/test_config.py new file mode 100644 index 000000000..6475440f6 --- /dev/null +++ b/xtuner/configs/llava/llama3_70b_chat/test_config.py @@ -0,0 +1,207 @@ +from xtuner.model import OpenAIModel +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn1 +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from xtuner.dataset import LLaVAProxyEvalDataset1 + +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +model = dict(type=OpenAIModel) +prompt_template=None + +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +proxy_eval_dataset = dict(type=LLaVAProxyEvalDataset1) + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + 
proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + 
proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), +] + +# # TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn1, extra_collate_keys=['img_id']) +) + +test_evaluator = {} +test_cfg = dict(type=TestLoop, select_metric='first') + diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index 4964520fc..bc606fc27 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -19,7 +19,7 @@ from .mini_gemini_dataset import MiniGeminiDataset from .mini_gemini_proxy_eval_dataset import MiniGeminiProxyEvalDataset from .internvl_v1_5_llava_proxy_eval_dataset import InternVL_v1_5_LLaVAProxyEvalDataset - +from llava_proxy_eval_dataset1 import LLaVAProxyEvalDataset1 # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) diff --git a/xtuner/dataset/collate_fns/__init__.py b/xtuner/dataset/collate_fns/__init__.py index a08d404a2..9ddfd5fc1 100644 --- a/xtuner/dataset/collate_fns/__init__.py +++ b/xtuner/dataset/collate_fns/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
 from .default_collate_fn import default_collate_fn
 from .mmlu_collate_fn import mmlu_collate_fn
-from .mm_collate_fn import mm_collate_fn
+from .mm_collate_fn import mm_collate_fn, mm_collate_fn1
 
 __all__ = ['default_collate_fn', 'mmlu_collate_fn', 'mm_collate_fn']
diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py
index 47e29409c..1e4c1704d 100644
--- a/xtuner/dataset/collate_fns/mm_collate_fn.py
+++ b/xtuner/dataset/collate_fns/mm_collate_fn.py
@@ -85,3 +85,13 @@ def mm_collate_fn(instances: Sequence[Dict],
         return data_dict
     else:
         return {'data': data_dict, 'data_samples': None}
+
+
+def mm_collate_fn1(instances: Sequence[Dict],
+                   pad_index: int = DEFAULT_PAD_TOKEN_INDEX,
+                   return_hf_format: bool = False,
+                   extra_collate_keys=None):
+    data_dict = {'pixel_values': [inst['pixel_values'] for inst in instances],
+                 'text': [inst['text'] for inst in instances],
+                 'img_id': [inst['img_id'] for inst in instances]}
+    return {'data': data_dict, 'data_samples': None}
diff --git a/xtuner/dataset/llava_proxy_eval_dataset1.py b/xtuner/dataset/llava_proxy_eval_dataset1.py
new file mode 100644
index 000000000..570555e24
--- /dev/null
+++ b/xtuner/dataset/llava_proxy_eval_dataset1.py
@@ -0,0 +1,106 @@
+from xtuner.dataset.utils import expand2square
+from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
+import torch
+from PIL import Image
+import os
+from xtuner.tools.utils import is_cn_string
+import asyncio
+from openai import AsyncOpenAI
+from typing import List
+
+import base64
+from io import BytesIO
+from typing import Union
+
+import requests
+from PIL import Image
+
+
+def encode_image_base64(image: Image.Image) -> str:
+    """Encode a PIL image as a PNG base64 data URL."""
+    buffered = BytesIO()
+    image.save(buffered, format='PNG')
+
+    return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"
+
+
+class LLaVAProxyEvalDataset1:
+    def __init__(self, eval_dataset):
+        self.eval_ds = eval_dataset
+
+    def getitem(self, idx, data):
+        data_dict = {'img_id': data['img_id']}
+
+        # 1 prepare text
+        if self.eval_ds.metainfo['name'] == 'multiple_choice':
+            # MultipleChoiceDataset
+            if data['context'] is not None:
+                text = data['context'] + '\n' + data[
+                    'question'] + '\n' + data['options']
+            else:
+                text = data['question'] + '\n' + data['options']
+            # text = DEFAULT_IMAGE_TOKEN + '\n' + text
+
+            if is_cn_string(text):
+                text = text + '请直接回答选项字母。'
+            else:
+                # TODO: prompts differ from those used in vlmevalkit
+                text = text + ("Answer with the option's letter from the "
+                               'given choices directly.')
+        elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']:
+            # TODO: prompts differ from those used in vlmevalkit
+            text = data['question'] + '\nAnswer the question using a single word or phrase.'
+            # text = DEFAULT_IMAGE_TOKEN + '\n' + text
+        elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']:
+            # TODO: prompts differ from those used in vlmevalkit
+            text = data['question'] + '\nPlease answer yes or no.'
+ # text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + # text = DEFAULT_IMAGE_TOKEN + '\n' + text + data_dict['text'] = text + + # if self.eval_ds.use_system: + # inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + # else: + # inputs = '' + # inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + # chunk_encode = [] + # for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + # if idx == 0: + # # add bos token + # bos_token_id = self.eval_ds.tokenizer.bos_token_id + # cur_encode = [bos_token_id] + # cur_encode += self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + # else: + # cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + # chunk_encode.append(cur_encode) + # assert len(chunk_encode) == 2 + # ids = [] + # for idx, cur_chunk_encode in enumerate(chunk_encode): + # ids.extend(cur_chunk_encode) + # if idx != len(chunk_encode) - 1: + # ids.append(IMAGE_TOKEN_INDEX) + # ids = torch.tensor(ids) + # data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])) + else: + image = self.eval_ds.get_image(data['img']) + image = encode_image_base64(image) + # if self.eval_ds.pad_image_to_square: + # image = expand2square( + # image, + # tuple( + # int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + # image = self.eval_ds.image_processor.preprocess( + # image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict diff --git a/xtuner/model/__init__.py b/xtuner/model/__init__.py index 6ba85bb12..241d86e24 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -4,5 +4,6 @@ from .anyres_llava import AnyResLLaVAModel from .mini_gemini import MiniGeminiModel from .internvl_1_5_llava import InternVL_v1_5_LLaVAModel +from .openai import OpenAIModel -__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel', 'InternVL_v1_5_LLaVAModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel', 'InternVL_v1_5_LLaVAModel', 'OpenAIModel'] diff --git a/xtuner/model/openai.py b/xtuner/model/openai.py new file mode 100644 index 000000000..718d821e6 --- /dev/null +++ b/xtuner/model/openai.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
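# A minimal sketch of the OpenAI-style chat payload this evaluation path ends up
# assembling: `text` comes from LLaVAProxyEvalDataset1.getitem and the image is
# carried as the base64 data URL returned by encode_image_base64. The literal
# values below are placeholders for illustration, not taken from the patch.
_EXAMPLE_VISION_CHAT_MESSAGES = [
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Please answer yes or no.'},
            {'type': 'image_url',
             'image_url': {'url': 'data:image/png;base64,<IMAGE_BYTES>'}},
        ],
    },
]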
+from collections import OrderedDict + +import torch +import torch.nn as nn +from .utils import (get_peft_model_state_dict, guess_load_checkpoint, + prepare_inputs_labels_for_multimodal) +from mmengine.model import BaseModel +import asyncio +from openai import AsyncOpenAI +from typing import List + + +class OpenaiBackend: + + def __init__(self, api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1'): + self.client = AsyncOpenAI(api_key=api_key, base_url=base_url) + + async def request_completion(self, model_id, messages): + response = await self.client.chat.completions.create( + model=model_id, + messages=messages, + temperature=0.8, + top_p=0.8) + return response # .choices[0].message.content + + async def _batch_infer(self, messages: List[List]): + model_cards = await self.client.models.list()._get_page() + model_id = model_cards.data[0].id + + tasks = [self.request_completion(model_id, msg) for msg in messages] + + responses = await asyncio.gather(*tasks) + + return [res.choices[0].message.content for res in responses] + + def batch_infer(self, messages: List[List]): + return asyncio.run(self._batch_infer(messages)) + + +class OpenAIModel(BaseModel): + def __init__(self): + super().__init__() + self.model = OpenaiBackend(base_url='http://10.140.24.142:23333/v1') + + def forward(self, data, data_samples=None, mode='loss'): + pixel_values = data['pixel_values'][0] + text = data['text'][0] + + prompt = [ + { + 'role': 'user', + 'content': [ + {'type': 'text', 'text': text}, + {'type': 'image_url', 'image_url': {'url': pixel_values}} + ] + } + ] + prediction = self.model.batch_infer([prompt])[0] + return dict(prediction=prediction) + + def preparing_for_generation(self, metainfo: dict = None): + pass From 55f01aa1c75dde7fff1e39e33d5d6621d368b427 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 7 May 2024 19:47:38 +0800 Subject: [PATCH 101/126] fix --- xtuner/dataset/__init__.py | 2 +- xtuner/model/openai.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index bc606fc27..e98cbd157 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -19,7 +19,7 @@ from .mini_gemini_dataset import MiniGeminiDataset from .mini_gemini_proxy_eval_dataset import MiniGeminiProxyEvalDataset from .internvl_v1_5_llava_proxy_eval_dataset import InternVL_v1_5_LLaVAProxyEvalDataset -from llava_proxy_eval_dataset1 import LLaVAProxyEvalDataset1 +from .llava_proxy_eval_dataset1 import LLaVAProxyEvalDataset1 # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) diff --git a/xtuner/model/openai.py b/xtuner/model/openai.py index 718d821e6..81f13fa1d 100644 --- a/xtuner/model/openai.py +++ b/xtuner/model/openai.py @@ -59,5 +59,7 @@ def forward(self, data, data_samples=None, mode='loss'): prediction = self.model.batch_infer([prompt])[0] return dict(prediction=prediction) + def gradient_checkpointing_disable(self): + pass def preparing_for_generation(self, metainfo: dict = None): pass From 1c5de9dd7b4b9ccd9d5eb568e44230e9ea942a00 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 May 2024 10:47:36 +0800 Subject: [PATCH 102/126] add allava --- .../llava/phi3_mini_chat/allava_pretrain.py | 271 ++++++++++++++++++ xtuner/dataset/llava.py | 26 +- 2 files changed, 286 insertions(+), 11 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py 
b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py new file mode 100644 index 000000000..aa62dd9ea --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset,ConcatDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +sharegpt4v_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.json' +sharegpt4v_image_folder = data_root + 'data' + +data_root = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root + 'allava_laion/ALLaVA-Caption-LAION-4V_llava.json' +allava_laion_image_folder = 's3://xtuner/huanghaian/data/ALLaVA-4V/' + +data_root = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' +allava_vflan_image_folder = 's3://xtuner/huanghaian/data/ALLaVA-4V/' + +allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K.json' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(4096 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + 
type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +sharegpt4v_dataset = dict( + type=LLaVADataset, + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_laion_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_text_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=allava_text_data_path, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +# 偷个懒,应该是用 repeat +allava_text1_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=allava_text_data_path, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataset = dict( + type=ConcatDataset, + datasets=[ + sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, + allava_text_dataset, allava_text1_dataset + ]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + 
end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index e3ba70273..ba06da8bc 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -2,7 +2,7 @@ import json import logging import os - +import io import torch from datasets import Dataset as HFDataset from datasets import DatasetDict, load_from_disk @@ -14,8 +14,7 @@ from xtuner.registry import BUILDER from .huggingface import process_hf_dataset from .utils import expand2square, process_anyres_image, total_image_token, dynamic_preprocess -from concurrent.futures import ThreadPoolExecutor -from tqdm import tqdm +from mmengine.fileio import get def load_jsonl(json_file): @@ -112,12 +111,20 @@ def modality_length(self): def __len__(self): return len(self.text_data) + def get_image(self, path): + if path.startswith('s3://'): + img_bytes = get(path) + with io.BytesIO(img_bytes) as buff: + img = Image.open(buff).convert('RGB') + return img + else: + return Image.open(path).convert('RGB') + def __getitem__(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = Image.open(os.path.join(self.image_folder, - image_file)).convert('RGB') + image = self.get_image(os.path.join(self.image_folder, image_file)) if self.pad_image_to_square: image = expand2square( image, @@ -153,8 +160,7 @@ def __getitem__(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = Image.open(os.path.join(self.image_folder, - image_file)).convert('RGB') + image = 
self.get_image(os.path.join(self.image_folder, image_file)) orig_size = image.size # use to remove padding data_dict['orig_size'] = orig_size @@ -204,8 +210,7 @@ def __calc_fn(self, data_dict): if self.image_size_json is not None: size = self.image_size_json[image_file] else: - image = Image.open(os.path.join(self.image_folder, - image_file)) + image = self.get_image(os.path.join(self.image_folder, image_file)) size = image.size num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, self._patch_size) @@ -229,8 +234,7 @@ def __getitem__(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = Image.open(os.path.join(self.image_folder, - image_file)).convert('RGB') + image = self.get_image(os.path.join(self.image_folder, image_file)) images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) for i, image in enumerate(images): image = self.image_processor.preprocess( From c361adc2940090ecc1197446d063d5af97e7abac Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 May 2024 13:32:33 +0800 Subject: [PATCH 103/126] fix --- .../llava/phi3_mini_chat/allava_pretrain.py | 35 +++++++------------ xtuner/tools/train.py | 16 ++++++--- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py index aa62dd9ea..58cd808d5 100644 --- a/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py @@ -7,7 +7,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor, CLIPVisionModel) -from xtuner.dataset import LLaVADataset,ConcatDataset +from xtuner.dataset import LLaVADataset, ConcatDataset from xtuner.dataset.collate_fns import mm_collate_fn from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook @@ -33,15 +33,15 @@ data_root = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' -allava_vflan_image_folder = 's3://xtuner/huanghaian/data/ALLaVA-4V/' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' -allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K.json' +allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' prompt_template = PROMPT_TEMPLATE.phi3_chat -max_length = int(4096 - (336 / 14)**2) +max_length = int(2048 - (336 / 14) ** 2) # Scheduler & Optimizer -batch_size = 32 # per_device +batch_size = 16 # per_device 16gx16 accumulative_counts = 1 dataloader_num_workers = 4 max_epochs = 1 @@ -94,8 +94,11 @@ ####################################################################### # PART 3 Dataset & Dataloader # ####################################################################### +cache_2k_root = data_root + 'phi3_mini_2k_offline/' + sharegpt4v_dataset = dict( type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'sharegpt4v_dataset', data_path=sharegpt4v_data_path, image_folder=sharegpt4v_image_folder, tokenizer=tokenizer, @@ -108,7 +111,7 @@ allava_laion_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset', data_path=allava_laion_data_path, image_folder=allava_laion_image_folder, tokenizer=tokenizer, @@ -121,7 +124,7 @@ 
allava_vflan_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset', data_path=allava_vflan_data_path, image_folder=allava_vflan_image_folder, tokenizer=tokenizer, @@ -134,23 +137,11 @@ allava_text_dataset = dict( type=LLaVADataset, - offline_processed_text_folder=None, - data_path=allava_text_data_path, - tokenizer=tokenizer, - image_processor=image_processor, - dataset_map_fn=llava_map_fn, - template_map_fn=dict( - type=template_map_fn_factory, template=prompt_template), - max_length=max_length, - pad_image_to_square=False) - -# 偷个懒,应该是用 repeat -allava_text1_dataset = dict( - type=LLaVADataset, - offline_processed_text_folder=None, + offline_processed_text_folder=cache_2k_root + 'allava_text_dataset', data_path=allava_text_data_path, tokenizer=tokenizer, image_processor=image_processor, + image_folder=None, dataset_map_fn=llava_map_fn, template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), @@ -161,7 +152,7 @@ type=ConcatDataset, datasets=[ sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, - allava_text_dataset, allava_text1_dataset + allava_text_dataset, allava_text_dataset ]) train_dataloader = dict( diff --git a/xtuner/tools/train.py b/xtuner/tools/train.py index 4c9400101..a211e937c 100644 --- a/xtuner/tools/train.py +++ b/xtuner/tools/train.py @@ -332,11 +332,17 @@ def main(): 'sequence_parallel_size', 1)) cfg.__setitem__('strategy', strategy) - optim_wrapper = dict( - type='DeepSpeedOptimWrapper', - optimizer=cfg.optim_wrapper.optimizer, - constructor=cfg.optim_wrapper.get('constructor', None), - paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) + if 'constructor' in cfg.optim_wrapper: + optim_wrapper = dict( + type='DeepSpeedOptimWrapper', + optimizer=cfg.optim_wrapper.optimizer, + constructor=cfg.optim_wrapper.constructor, + paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) + else: + optim_wrapper = dict( + type='DeepSpeedOptimWrapper', + optimizer=cfg.optim_wrapper.optimizer, + paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) cfg.__setitem__('optim_wrapper', optim_wrapper) cfg.runner_type = 'FlexibleRunner' From 2409629a50bd39b14b5a2f02179e5c3a3a15ef3f Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 May 2024 15:39:46 +0800 Subject: [PATCH 104/126] add finetune --- .../llava/phi3_mini_chat/allava_finetune.py | 539 ++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 xtuner/configs/llava/phi3_mini_chat/allava_finetune.py diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py new file mode 100644 index 000000000..14f3044cb --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py @@ -0,0 +1,539 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
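# A minimal standalone sketch of the image-loading behaviour introduced in the
# llava.py diff above: with mmengine.fileio.get, one helper transparently reads
# either a local file or an object-store URI such as 's3://bucket/img.jpg'
# (assuming a backend for the 's3://' prefix, e.g. petrel, is configured).
# The helper name below is illustrative and not part of the config.
def _load_image_any_backend(path):
    """Open an image from a local path or an s3:// URI and convert to RGB."""
    import io
    from mmengine.fileio import get
    from PIL import Image
    if path.startswith('s3://'):
        with io.BytesIO(get(path)) as buff:
            return Image.open(buff).convert('RGB')
    return Image.open(path).convert('RGB')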
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/allava_pretrain/iter_4214.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root1 + 'allava_laion/ALLaVA-Instruct-LAION-4V_llava.json' +allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' + +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device 16g +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the 
training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = True + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset 
= dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +################## +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' + +allava_laion_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset_sft', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset, allava_laion_dataset, allava_vflan_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + 
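# A rough sketch of how the layer-wise lr decay requested above (use_lldr=True
# together with LearningRateDecayOptimWrapperConstructor and
# paramwise_cfg=dict(layer_decay_rate=0.9)) is commonly realised: a parameter at
# depth d, as reported by the visual encoder's get_layer_depth hook, trains with
# base_lr * layer_decay_rate ** (num_layers - d). The exact convention inside the
# constructor may differ; the helper below is only illustrative.
def _illustrative_layer_lr(base_lr, depth, num_layers, layer_decay_rate=0.9):
    """Return the decayed lr for a parameter group at the given layer depth."""
    return base_lr * layer_decay_rate ** (num_layers - depth)
# e.g. for ViT-L (24 layers) with lr = 2e-5: depth 0 -> ~1.6e-6, depth 24 -> 2e-5.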
+####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + 
image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + 
tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From 43b27d09460cf5ff4cf7675aed52f4c3ab9f0749 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 May 2024 16:29:30 +0800 Subject: [PATCH 105/126] add finetune1 --- .../llava/phi3_mini_chat/allava_finetune1.py | 399 ++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py new file mode 100644 index 000000000..cb5eedfa9 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
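# A simplified, hypothetical stand-in (not the real mm_collate_fn) illustrating
# what extra_collate_keys=['img_id'] in the val/test dataloaders above is for:
# besides batching the model inputs, the listed per-sample keys are passed
# through untouched so predictions can later be matched back to their source
# samples. The actual mm_collate_fn in xtuner additionally pads input_ids and
# stacks pixel_values; only the pass-through idea is shown here.
def _collate_with_extra_keys(instances, extra_collate_keys=None):
    batch = {'pixel_values': [inst['pixel_values'] for inst in instances]}
    for key in (extra_collate_keys or []):
        batch[key] = [inst[key] for inst in instances]
    return {'data': batch, 'data_samples': None}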
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/allava_pretrain/iter_4214.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/zhaoxiangyu/' +data_path = data_root + 'instruct_llava_allava_doc_dvqa_share_ai2d_1383k.json' +image_folder = '/mnt/petrelfs/share_data/huanghaian/xiangyu_mix_sft_data/' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device 16g +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 
Dataset & Dataloader # +####################################################################### +cache_path = '/mnt/petrelfs/share_data/huanghaian/xiangyu_mix_sft_data/phi3-mini-2k-sft' +pad_image_to_square = True + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_path, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
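+    # DistSamplerSeedHook calls `set_epoch` on the sampler at the start of
+    # each epoch, so the LengthGroupedSampler configured above reshuffles
+    # its modality-length buckets in the same order on every rank.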
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + 
'/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, 
select_metric='first') From 28796c23d26f8f353793769f3ef69950f5eafaf8 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sat, 11 May 2024 11:07:57 +0800 Subject: [PATCH 106/126] add config --- .../llava/phi3_mini_chat/llava_allava.py | 466 ++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_allava.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py new file mode 100644 index 000000000..1102b92e9 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py @@ -0,0 +1,466 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root1 + 'allava_laion/ALLaVA-Instruct-LAION-4V_llava.json' +allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' + +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance 
during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' + +allava_laion_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset_sft', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, allava_laion_dataset, allava_vflan_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + 
constructor='LearningRateDecayOptimWrapperConstructor',  # ==== enable layer-wise lr decay ====
+    paramwise_cfg=dict(layer_decay_rate=0.75),  # vit-l
+    loss_scale='dynamic',
+    dtype='float16')
+
+# learning policy
+# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
+param_scheduler = [
+    dict(
+        type=LinearLR,
+        start_factor=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=warmup_ratio * max_epochs,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingLR,
+        eta_min=0.0,
+        by_epoch=True,
+        begin=warmup_ratio * max_epochs,
+        end=max_epochs,
+        convert_to_iter_based=True)
+]
+
+# train, val, test setting
+train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps)
+
+#######################################################################
+#                           PART 5  Runtime                           #
+#######################################################################
+# Log the dialogue periodically during the training process, optional
+custom_hooks = [
+    dict(type=DatasetInfoHook, tokenizer=tokenizer),
+    dict(
+        type=EvaluateChatHook,
+        tokenizer=tokenizer,
+        image_processor=image_processor,
+        every_n_iters=evaluation_freq,
+        evaluation_inputs=evaluation_inputs,
+        evaluation_images=evaluation_images,
+        system=SYSTEM,
+        prompt_template=prompt_template)
+]
+
+# configure default hooks
+default_hooks = dict(
+    # record the time of every iteration.
+    timer=dict(type=IterTimerHook),
+    # print log every 10 iterations.
+    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
+    # enable the parameter scheduler.
+    param_scheduler=dict(type=ParamSchedulerHook),
+    # save checkpoint per `save_steps`.
+    checkpoint=dict(
+        type=CheckpointHook,
+        save_optimizer=False,
+        by_epoch=False,
+        interval=save_steps,
+        max_keep_ckpts=save_total_limit),
+    # set sampler seed in distributed environment.
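+    # (A rough sketch of the layer-wise lr decay enabled in optim_wrapper
+    # above, assuming the usual BEiT-style rule; the exact grouping is
+    # decided by LearningRateDecayOptimWrapperConstructor:
+    #     lr(layer_i) = lr * layer_decay_rate ** (num_layers - layer_i)
+    # e.g. 2e-5 * 0.75 ** 24 is roughly 2e-8 for the earliest ViT-L block.)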
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # 
dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From aae7b46f4bdef6888f3e88305272325d28b770e3 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sat, 11 May 2024 11:08:53 +0800 Subject: [PATCH 107/126] updata --- xtuner/configs/llava/phi3_mini_chat/allava_finetune.py | 2 +- xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py index 14f3044cb..176059073 100644 --- a/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py @@ -64,7 +64,7 @@ max_length = int(2048 - (336 / 14) ** 2) # Scheduler & Optimizer -batch_size = 8 # per_device 16g +batch_size = 16 # per_device accumulative_counts = 1 dataloader_num_workers = 4 max_epochs = 1 diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py index cb5eedfa9..f2460a4ee 100644 --- a/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py @@ -37,7 +37,7 @@ max_length = int(2048 - (336 / 14) ** 2) # Scheduler & Optimizer -batch_size = 8 # per_device 16g +batch_size = 16 # per_device accumulative_counts = 1 dataloader_num_workers = 4 max_epochs = 1 From 85a62f5ee895bb4b5572d5c3711198798d2202e1 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Sat, 11 May 2024 14:24:00 +0800 Subject: [PATCH 108/126] updata --- .../llava/phi3_mini_chat/llava_allava.py | 18 +- .../phi3_mini_chat/llava_allava_sharegpt.py | 466 ++++++++++++++++++ 2 files changed, 467 insertions(+), 17 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py index 1102b92e9..af15260cc 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_allava.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py @@ -35,9 +35,6 @@ image_folder = data_root + 'llava_images' data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' -allava_laion_data_path = data_root1 + 'allava_laion/ALLaVA-Instruct-LAION-4V_llava.json' -allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' - allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' allava_vflan_image_folder = 
'/mnt/petrelfs/share_data/zhaoxiangyu/' @@ -117,19 +114,6 @@ cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' -allava_laion_dataset = dict( - type=LLaVADataset, - offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset_sft', - data_path=allava_laion_data_path, - image_folder=allava_laion_image_folder, - tokenizer=tokenizer, - image_processor=image_processor, - dataset_map_fn=llava_map_fn, - template_map_fn=dict( - type=template_map_fn_factory, template=prompt_template), - max_length=max_length, - pad_image_to_square=True) - allava_vflan_dataset = dict( type=LLaVADataset, offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', @@ -145,7 +129,7 @@ train_dataset = dict( type=ConcatDataset, - datasets=[llava_dataset, allava_laion_dataset, allava_vflan_dataset]) + datasets=[llava_dataset, allava_vflan_dataset]) train_dataloader = dict( batch_size=batch_size, diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py new file mode 100644 index 000000000..1b5ca0cb8 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py @@ -0,0 +1,466 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + + +prompt_template 
= PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +sharegpt4v_sft_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, allava_vflan_dataset, sharegpt4v_sft_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * 
accumulative_counts),
+    collate_fn=dict(type=mm_collate_fn))
+
+#######################################################################
+#                    PART 4  Scheduler & Optimizer                    #
+#######################################################################
+# optimizer
+optim_wrapper = dict(
+    type=AmpOptimWrapper,
+    optimizer=dict(
+        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
+    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
+    accumulative_counts=accumulative_counts,
+    constructor='LearningRateDecayOptimWrapperConstructor',  # ==== enable layer-wise lr decay ====
+    paramwise_cfg=dict(layer_decay_rate=0.75),  # vit-l
+    loss_scale='dynamic',
+    dtype='float16')
+
+# learning policy
+# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
+param_scheduler = [
+    dict(
+        type=LinearLR,
+        start_factor=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=warmup_ratio * max_epochs,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingLR,
+        eta_min=0.0,
+        by_epoch=True,
+        begin=warmup_ratio * max_epochs,
+        end=max_epochs,
+        convert_to_iter_based=True)
+]
+
+# train, val, test setting
+train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps)
+
+#######################################################################
+#                           PART 5  Runtime                           #
+#######################################################################
+# Log the dialogue periodically during the training process, optional
+custom_hooks = [
+    dict(type=DatasetInfoHook, tokenizer=tokenizer),
+    dict(
+        type=EvaluateChatHook,
+        tokenizer=tokenizer,
+        image_processor=image_processor,
+        every_n_iters=evaluation_freq,
+        evaluation_inputs=evaluation_inputs,
+        evaluation_images=evaluation_images,
+        system=SYSTEM,
+        prompt_template=prompt_template)
+]
+
+# configure default hooks
+default_hooks = dict(
+    # record the time of every iteration.
+    timer=dict(type=IterTimerHook),
+    # print log every 10 iterations.
+    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
+    # enable the parameter scheduler.
+    param_scheduler=dict(type=ParamSchedulerHook),
+    # save checkpoint per `save_steps`.
+    checkpoint=dict(
+        type=CheckpointHook,
+        save_optimizer=False,
+        by_epoch=False,
+        interval=save_steps,
+        max_keep_ckpts=save_total_limit),
+    # set sampler seed in distributed environment.
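+    # (The checkpoints above are weights-only because `save_optimizer=False`:
+    # an interrupted run can reload model weights but not the optimizer or
+    # AMP loss-scale state. The sampler-seed hook below keeps shuffling in
+    # sync across ranks.)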
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # 
dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') From f998ae55831c3fb216553be0ca8adb39ceacbb0c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 13 May 2024 15:25:04 +0800 Subject: [PATCH 109/126] update --- .../llava/phi3_mini_chat/llava_allava_sharegpt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py index 1b5ca0cb8..2fbcf84cd 100644 --- a/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py @@ -26,18 +26,20 @@ # Model llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + # Specify the pretrained pth pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 # Data -data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' -data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' -image_folder = data_root + 'llava_images' +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' sharegpt4v_image_folder = data_root + 'data' @@ -105,8 +107,8 @@ llava_dataset = dict( type=LLaVADataset, offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', - data_path=data_path, - image_folder=image_folder, + data_path=llava_data_path, + image_folder=llava_image_folder, tokenizer=tokenizer, image_processor=image_processor, dataset_map_fn=llava_map_fn, From 6278913c6f4158a53efd4b28661107a0dd4ec06f Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 16 May 2024 20:37:43 +0800 Subject: [PATCH 110/126] add patch select --- .../phi3_internvl_1-5_more_data_pretrain.py | 358 
++++++++++++++++++ xtuner/dataset/__init__.py | 5 +- xtuner/dataset/huggingface.py | 20 +- xtuner/dataset/llava.py | 68 ++-- xtuner/dataset/utils.py | 60 ++- 5 files changed, 482 insertions(+), 29 deletions(-) create mode 100644 xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py new file mode 100644 index 000000000..77bc6d745 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -0,0 +1,358 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset import ConcatDataset +from xtuner.dataset.utils import internvl_1_5_encode_fn +from xtuner.dataset.samplers import LengthGroupedSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +share_data_root = '/mnt/hwfile/xtuner/huanghaian/data/sharegpt4v/' +sharegpt4v_data_path = share_data_root + 'share-captioner_coco_lcs_sam_1246k_1107_llava.json' +sharegpt4v_image_folder = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/data' + +data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root + 'allava_laion/ALLaVA-Caption-LAION-4V_llava.json' +allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' + +data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' + +laion_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/' +laion_data_path = laion_data_root + 'filter_rand_10m_llava.json' +laion_image_folder = 'public:s3://public-dataset/laion-coco/images/' + +coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' +coyo_data_path = coyo_data_root + 'filter_rand_10m_llava.json' +coyo_image_folder = 'public:s3://public-dataset/COYO-700M/data/' + +prompt_template = PROMPT_TEMPLATE.phi3_chat + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 
1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_2k_root = laion_data_root + 'phi3_mini_2k_offline/' +laion_coco_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'laion_coco_dataset_10m', + data_path=laion_data_path, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # 核心参数 + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +cache_2k_root = coyo_data_root + 'phi3_mini_2k_offline/' +coyo_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'coyo_dataset_10m', + data_path=coyo_data_path, + image_folder=coyo_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # 核心参数 + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +cache_2k_root = share_data_root + 'phi3_mini_2k_offline/' +sharegpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + 
type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # core parameters + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +cache_2k_root = data_root + 'phi3_mini_2k_offline/' +allava_laion_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # core parameters + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +cache_2k_root = data_root + 'phi3_mini_2k_offline/' +allava_vflan_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # core parameters + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +allava_text_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + use_patch=False, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_2k_root + 'allava_text_dataset', + data_path=allava_text_data_path, + tokenizer=tokenizer, + image_processor=image_processor, + image_folder=None, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num, + use_patch=False), # core parameters + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=2048) + +train_dataset = dict( + type=ConcatDataset, + datasets=[ + laion_coco_dataset, coyo_dataset, + sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, + allava_text_dataset, allava_text_dataset + ]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + 
convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=True, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed environment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index e98cbd157..14fd06d81 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -13,7 +13,7 @@ from .moss_sft import MOSSSFTDataset from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset, RefCOCOJsonEvalDataset) -from .utils import decode_base64_to_image, expand2square, load_image +from .utils import decode_base64_to_image, expand2square, load_image, internvl_1_5_encode_fn from .llava_proxy_eval_dataset import LLaVAProxyEvalDataset from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset from .mini_gemini_dataset import MiniGeminiDataset @@ -40,5 +40,6 @@ 'MiniGeminiDataset', 'MiniGeminiProxyEvalDataset', 'InternVL_V1_5_LLaVADataset', - 'InternVL_v1_5_LLaVAProxyEvalDataset' + 'InternVL_v1_5_LLaVAProxyEvalDataset', + 'internvl_1_5_encode_fn' ] diff --git a/xtuner/dataset/huggingface.py b/xtuner/dataset/huggingface.py index c44e88688..f29eafe49 100644 --- a/xtuner/dataset/huggingface.py +++ b/xtuner/dataset/huggingface.py @@ -12,7 +12,8 @@ from torch import distributed as dist from xtuner.registry import BUILDER, MAP_FUNC -from .utils import Packer, encode_fn +from .utils import Packer +from .utils import encode_fn as default_encode_fn def get_lengths(example): @@ -66,12 +67,21 @@ def add_template_to_dataset(dataset, template_map_fn, map_num_proc): def tokenize_dataset(dataset, tokenizer, max_length, with_image_token, input_ids_with_output, remove_unused_columns, - map_num_proc): + map_num_proc, encode_map_fn=None): assert (tokenizer is not None) and (max_length is not 
None), \ f'({tokenizer}, {max_length})' if isinstance(tokenizer, dict) or isinstance( tokenizer, Config) or isinstance(tokenizer, ConfigDict): tokenizer = BUILDER.build(tokenizer) + if encode_map_fn is None: + encode_fn = default_encode_fn + else: + if isinstance(encode_map_fn, + dict) or isinstance(encode_map_fn, Config) or \ + isinstance(encode_map_fn, ConfigDict): + encode_fn = BUILDER.build(encode_map_fn) + else: + encode_fn = encode_map_fn dataset = dataset.map( partial( encode_fn, @@ -103,6 +113,7 @@ def process(dataset, max_length=None, dataset_map_fn=None, template_map_fn=None, + encode_map_fn=None, max_dataset_length=None, split='train', remove_unused_columns=False, @@ -198,7 +209,8 @@ def process(dataset, if do_dataset_tokenization: dataset = tokenize_dataset(dataset, tokenizer, max_length, with_image_token, input_ids_with_output, - remove_unused_columns, map_num_proc) + remove_unused_columns, map_num_proc, + encode_map_fn=encode_map_fn) if input_ids_with_output: assert {'input_ids', 'labels'}.issubset(dataset.column_names) @@ -226,6 +238,7 @@ def process_hf_dataset(dataset, dataset_map_fn=None, template_map_fn=None, max_dataset_length=None, + encode_map_fn=None, split='train', remove_unused_columns=False, rename_maps=[], @@ -284,6 +297,7 @@ def process_hf_dataset(dataset, max_length=max_length, dataset_map_fn=dataset_map_fn, template_map_fn=template_map_fn, + encode_map_fn=encode_map_fn, max_dataset_length=max_dataset_length, split=split, remove_unused_columns=remove_unused_columns, diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index ba06da8bc..9193577e8 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -15,7 +15,9 @@ from .huggingface import process_hf_dataset from .utils import expand2square, process_anyres_image, total_image_token, dynamic_preprocess from mmengine.fileio import get - +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor +import numpy as np def load_jsonl(json_file): with open(json_file) as f: @@ -37,6 +39,7 @@ def __init__(self, max_dataset_length=None, dataset_map_fn=None, template_map_fn=None, + encode_map_fn=None, max_length=2048, s2_scales=None, # [1, 2] or [1,2,3] pad_image_to_square=False): @@ -72,6 +75,7 @@ def __init__(self, max_length=max_length, dataset_map_fn=dataset_map_fn, template_map_fn=template_map_fn, + encode_map_fn=encode_map_fn, split='train', max_dataset_length=max_dataset_length, remove_unused_columns=False, @@ -179,10 +183,11 @@ def __getitem__(self, index): class InternVL_V1_5_LLaVADataset(LLaVADataset): - def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, image_size_json=None, *args, **kwargs): + def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, use_patch=True, *args, **kwargs): self.min_num = min_num self.max_num = max_num self.downsample_ratio = downsample_ratio + self.use_patch = use_patch super().__init__(*args, **kwargs) if hasattr(self.image_processor, 'crop_size'): @@ -196,10 +201,7 @@ def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, image self._image_size = image_size self._patch_size = (self._image_size // 14) * downsample_ratio # 12 - self.image_size_json = None - if image_size_json is not None: - with open(image_size_json, 'r') as f: - self.image_size_json = json.load(f) + self.max_refetch = 1000 def __calc_fn(self, data_dict): cur_len = len(data_dict['input_ids']) @@ -207,35 +209,52 @@ def __calc_fn(self, data_dict): cur_len = len(data_dict['input_ids']) if data_dict.get('image', None) is not 
None: image_file = data_dict['image'] - if self.image_size_json is not None: - size = self.image_size_json[image_file] + assert 'image_wh' in data_dict + if 'image_wh' in data_dict: + size = data_dict['image_wh'][0] else: image = self.get_image(os.path.join(self.image_folder, image_file)) size = image.size - num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, - self._patch_size) + if self.use_patch: + num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, + self._patch_size) + else: + num_image_token = self._patch_size * self._patch_size cur_len += num_image_token cur_len = -cur_len return cur_len - # @property - # def modality_length(self): - # print_log('start calculating modality length', logger='current'), - # with ThreadPoolExecutor(max_workers=8) as executor: - # length_list = list( - # tqdm( - # executor.map(self.__calc_fn, self.text_data), - # desc='Calculating modality length', - # total=len(self.text_data))) - # print_log('end calculating modality length', logger='current'), - # return length_list + @property + def modality_length(self): + print_log('start calculating modality length', logger='current'), + with ThreadPoolExecutor(max_workers=16) as executor: + length_list = list( + tqdm( + executor.map(self.__calc_fn, self.text_data), + desc='Calculating modality length', + total=len(self.text_data))) + print_log('end calculating modality length', logger='current'), + return length_list def __getitem__(self, index): + for _ in range(self.max_refetch + 1): + data = self.prepare_data(index) + # Broken images may cause the returned data to be None + if data is None: + idx = self._rand_another() + continue + return data + + def prepare_data(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = self.get_image(os.path.join(self.image_folder, image_file)) - images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) + try: + image = self.get_image(os.path.join(self.image_folder, image_file)) + except Exception as e: + print_log(f'Error: {e}', logger='current') + return None + images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size, use_patch=self.use_patch) for i, image in enumerate(images): image = self.image_processor.preprocess( image, return_tensors='pt')['pixel_values'][0] @@ -246,3 +265,6 @@ def __getitem__(self, index): data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'], self._crop_size['width']) return data_dict + + def _rand_another(self) -> int: + return np.random.randint(0, len(self.text_data)) diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index c626a3aab..d6a184c6a 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -486,10 +486,16 @@ def total_image_token(orig_size, min_num=1, max_num=6, image_size=336, patch_siz return blocks*patch_size*patch_size -def dynamic_preprocess(image, min_num=1, max_num=6, image_size=336, use_thumbnail=True): +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=336, use_thumbnail=True, use_patch=True): orig_width, orig_height = image.size aspect_ratio = orig_width / orig_height + if not use_patch: + processed_images = [] + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + # calculate the existing image aspect ratio target_ratios = set( (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in 
range(1, n + 1) if @@ -523,3 +529,55 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=336, use_thumbnai thumbnail_img = image.resize((image_size, image_size)) processed_images.append(thumbnail_img) return processed_images + + +def internvl_1_5_encode_fn(example, + tokenizer, + max_length, + input_ids_with_output=True, + with_image_token=False, + min_num=1, + max_num=6, + image_size=336, + patch_size=12, + use_patch=True): + """We only support the following three scenarios: + + 1. Incremental pretraining dataset. + example['conversation'] = [ + { + 'input': '', + 'output': '### Human: Can you write xxx' + } + ] + + 2. Single-turn conversation dataset. + example['conversation'] = [ + { + 'input': 'Give three tips for staying healthy.', + 'output': '1.Eat a balanced diet xxx' + } + ] + + 3. Multi-turn conversation dataset. + example['conversation'] = [ + { + 'input': 'Give three tips for staying healthy.', + 'output': '1.Eat a balanced diet xxx' + }, + { + 'input': 'Please expand on the second point.', + 'output': 'Here is an expanded explanation of the xxx' + } + ] + """ + img_token = 0 + if 'image' in example: + if use_patch: + assert 'image_wh' in example + img_token = total_image_token(example['image_wh'][0], min_num, max_num, image_size, patch_size) + else: + # clip + img_token = patch_size * patch_size + max_length = max_length - img_token + return encode_fn(example, tokenizer, max_length, input_ids_with_output, with_image_token) From 98f1f586b4a8fa7a11238069d90e9fd4e629358d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 16 May 2024 20:44:06 +0800 Subject: [PATCH 111/126] fix --- xtuner/dataset/llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 9193577e8..d2db1ebfa 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -241,7 +241,7 @@ def __getitem__(self, index): data = self.prepare_data(index) # Broken images may cause the returned data to be None if data is None: - idx = self._rand_another() + index = self._rand_another() continue return data From bd4bf226b77a0cd28bb34a2137dfcebaa81330be Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 17 May 2024 10:37:45 +0800 Subject: [PATCH 112/126] update --- .../phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index 77bc6d745..d21c84ae7 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -45,8 +45,8 @@ laion_image_folder = 'public:s3://public-dataset/laion-coco/images/' coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' -coyo_data_path = coyo_data_root + 'filter_rand_10m_llava.json' -coyo_image_folder = 'public:s3://public-dataset/COYO-700M/data/' +coyo_data_path = coyo_data_root + 'filter_rand_20m_llava.json' +coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' prompt_template = PROMPT_TEMPLATE.phi3_chat @@ -138,7 +138,7 @@ min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'coyo_dataset_10m', + offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m', data_path=coyo_data_path, image_folder=coyo_image_folder, tokenizer=tokenizer, From 32723b1468a0f7d314c40597a1a881271bdfb063 Mon 
Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 17 May 2024 16:58:58 +0800 Subject: [PATCH 113/126] fix bug --- .../phi3_internvl_1-5_more_data_pretrain.py | 12 ++++++------ xtuner/dataset/huggingface.py | 4 +++- xtuner/dataset/llava.py | 10 ++++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index d21c84ae7..617c7e6d4 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -22,21 +22,21 @@ # PART 1 Settings # ####################################################################### # Model -llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' -visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +llm_name_or_path = '/mnt/hwfile/xtuner/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = '/mnt/hwfile/xtuner/linzhihao/model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' # Data share_data_root = '/mnt/hwfile/xtuner/huanghaian/data/sharegpt4v/' sharegpt4v_data_path = share_data_root + 'share-captioner_coco_lcs_sam_1246k_1107_llava.json' -sharegpt4v_image_folder = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/data' +sharegpt4v_image_folder = '/mnt/hwfile/xtuner/linzhihao/dataset/sharegpt4v/data' data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' allava_laion_data_path = data_root + 'allava_laion/ALLaVA-Caption-LAION-4V_llava.json' -allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' +allava_laion_image_folder = '/mnt/hwfile/openmmlab/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' -allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' +allava_vflan_image_folder = '/mnt/hwfile/openmmlab/zhaoxiangyu/' allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' @@ -51,7 +51,7 @@ prompt_template = PROMPT_TEMPLATE.phi3_chat # Scheduler & Optimizer -batch_size = 32 # per_device +batch_size = 16 # per_device accumulative_counts = 1 dataloader_num_workers = 4 max_epochs = 1 diff --git a/xtuner/dataset/huggingface.py b/xtuner/dataset/huggingface.py index f29eafe49..c6937cf13 100644 --- a/xtuner/dataset/huggingface.py +++ b/xtuner/dataset/huggingface.py @@ -79,7 +79,9 @@ def tokenize_dataset(dataset, tokenizer, max_length, with_image_token, if isinstance(encode_map_fn, dict) or isinstance(encode_map_fn, Config) or \ isinstance(encode_map_fn, ConfigDict): - encode_fn = BUILDER.build(encode_map_fn) + encode_fn = encode_map_fn.pop('type') + if len(encode_map_fn) != 0: + encode_fn = partial(encode_fn, **encode_map_fn) else: encode_fn = encode_map_fn dataset = dataset.map( diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index d2db1ebfa..d35a91691 100644 --- a/xtuner/dataset/llava.py +++ 
b/xtuner/dataset/llava.py @@ -213,8 +213,13 @@ def __calc_fn(self, data_dict): if 'image_wh' in data_dict: size = data_dict['image_wh'][0] else: - image = self.get_image(os.path.join(self.image_folder, image_file)) - size = image.size + try: + image = self.get_image(os.path.join(self.image_folder, image_file)) + size = image.size + except Exception as e: + print(f'Error: {e}', flush=True) + print_log(f'Error: {e}', logger='current') + size = [1, 1] if self.use_patch: num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, self._patch_size) @@ -252,6 +257,7 @@ def prepare_data(self, index): try: image = self.get_image(os.path.join(self.image_folder, image_file)) except Exception as e: + print(f'Error: {e}', flush=True) print_log(f'Error: {e}', logger='current') return None images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size, use_patch=self.use_patch) From 1eac3a01f4270edd6c3434887b7cd896d0fe167e Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 20 May 2024 13:39:31 +0800 Subject: [PATCH 114/126] update --- .../llava/llama3_70b_chat/test_config.py | 5 +- .../phi3_internvl_1-5_more_data_pretrain.py | 103 +++++++++++++----- xtuner/dataset/llava.py | 25 +++-- xtuner/model/openai.py | 4 +- 4 files changed, 92 insertions(+), 45 deletions(-) diff --git a/xtuner/configs/llava/llama3_70b_chat/test_config.py b/xtuner/configs/llava/llama3_70b_chat/test_config.py index 6475440f6..9e676d433 100644 --- a/xtuner/configs/llava/llama3_70b_chat/test_config.py +++ b/xtuner/configs/llava/llama3_70b_chat/test_config.py @@ -24,8 +24,8 @@ llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' -model = dict(type=OpenAIModel) -prompt_template=None +model = dict(type=OpenAIModel, base_url='http://10.140.24.142:23333/v1') +prompt_template = None tokenizer = dict( type=AutoTokenizer.from_pretrained, @@ -204,4 +204,3 @@ test_evaluator = {} test_cfg = dict(type=TestLoop, select_metric='first') - diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index 617c7e6d4..71ac1cb63 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -44,9 +44,12 @@ laion_data_path = laion_data_root + 'filter_rand_10m_llava.json' laion_image_folder = 'public:s3://public-dataset/laion-coco/images/' -coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' -coyo_data_path = coyo_data_root + 'filter_rand_20m_llava.json' -coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' +# coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' +# coyo_data_path1 = coyo_data_root + 'filter_rand_20m_llava_1.json' +# coyo_data_path2 = coyo_data_root + 'filter_rand_20m_llava_2.json' +# coyo_data_path3 = coyo_data_root + 'filter_rand_20m_llava_3.json' +# coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' + prompt_template = PROMPT_TEMPLATE.phi3_chat @@ -63,11 +66,11 @@ warmup_ratio = 0.03 # Save -save_steps = 1000 +save_steps = 5000 save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training -evaluation_freq = 1000 
+evaluation_freq = 5000 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] @@ -131,27 +134,69 @@ type=template_map_fn_factory, template=prompt_template), max_length=2048) -cache_2k_root = coyo_data_root + 'phi3_mini_2k_offline/' -coyo_dataset = dict( - type=InternVL_V1_5_LLaVADataset, - use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 - min_num=min_num, - max_num=max_num, - downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m', - data_path=coyo_data_path, - image_folder=coyo_image_folder, - tokenizer=tokenizer, - image_processor=image_processor, - dataset_map_fn=llava_map_fn, - encode_map_fn=dict( - type=internvl_1_5_encode_fn, - min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 - template_map_fn=dict( - type=template_map_fn_factory, template=prompt_template), - max_length=2048) +# cache_2k_root = coyo_data_root + 'phi3_mini_2k_offline/' +# coyo_dataset1 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_1', +# data_path=coyo_data_path1, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) +# +# coyo_dataset2 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_2', +# data_path=coyo_data_path2, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) +# +# coyo_dataset3 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_3', +# data_path=coyo_data_path3, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) cache_2k_root = share_data_root + 'phi3_mini_2k_offline/' sharegpt4v_dataset = dict( @@ -243,7 +288,7 @@ train_dataset = dict( type=ConcatDataset, datasets=[ - laion_coco_dataset, coyo_dataset, + laion_coco_dataset, # coyo_dataset1, coyo_dataset2, coyo_dataset3, sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, allava_text_dataset, allava_text_dataset ]) @@ -268,6 +313,8 @@ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), accumulative_counts=accumulative_counts, + # 
constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + # paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l loss_scale='dynamic', dtype='float16') @@ -321,7 +368,7 @@ # save checkpoint per `save_steps`. checkpoint=dict( type=CheckpointHook, - save_optimizer=True, + save_optimizer=False, by_epoch=False, interval=save_steps, max_keep_ckpts=save_total_limit), diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index d35a91691..68ad8b6d2 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -116,7 +116,7 @@ def __len__(self): return len(self.text_data) def get_image(self, path): - if path.startswith('s3://'): + if "s3://" in path: img_bytes = get(path) with io.BytesIO(img_bytes) as buff: img = Image.open(buff).convert('RGB') @@ -229,17 +229,18 @@ def __calc_fn(self, data_dict): cur_len = -cur_len return cur_len - @property - def modality_length(self): - print_log('start calculating modality length', logger='current'), - with ThreadPoolExecutor(max_workers=16) as executor: - length_list = list( - tqdm( - executor.map(self.__calc_fn, self.text_data), - desc='Calculating modality length', - total=len(self.text_data))) - print_log('end calculating modality length', logger='current'), - return length_list + # 太慢了,改离线吧 + # @property + # def modality_length(self): + # print_log('start calculating modality length', logger='current'), + # with ThreadPoolExecutor(max_workers=16) as executor: + # length_list = list( + # tqdm( + # executor.map(self.__calc_fn, self.text_data), + # desc='Calculating modality length', + # total=len(self.text_data))) + # print_log('end calculating modality length', logger='current'), + # return length_list def __getitem__(self, index): for _ in range(self.max_refetch + 1): diff --git a/xtuner/model/openai.py b/xtuner/model/openai.py index 81f13fa1d..1c261f534 100644 --- a/xtuner/model/openai.py +++ b/xtuner/model/openai.py @@ -39,9 +39,9 @@ def batch_infer(self, messages: List[List]): class OpenAIModel(BaseModel): - def __init__(self): + def __init__(self, base_url): super().__init__() - self.model = OpenaiBackend(base_url='http://10.140.24.142:23333/v1') + self.model = OpenaiBackend(base_url=base_url) def forward(self, data, data_samples=None, mode='loss'): pixel_values = data['pixel_values'][0] From 5137d54865f0131aa46e6097e371081dd28ad84c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 20 May 2024 14:32:22 +0800 Subject: [PATCH 115/126] add new config --- .../phi3_internvl_1-5_more_data_pretrain.py | 240 +++++++++++++++--- 1 file changed, 205 insertions(+), 35 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index 71ac1cb63..7b02f7227 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmengine.dataset import DefaultSampler from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, LoggerHook, ParamSchedulerHook) from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR @@ -40,16 +39,29 @@ allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' -laion_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/' -laion_data_path = laion_data_root + 'filter_rand_10m_llava.json' +laion_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/orig_merge_70m_data/' +laion_data_path0 = laion_data_root + 'filter_data_0_llava.json' +laion_data_path1 = laion_data_root + 'filter_data_1_llava.json' +laion_data_path2 = laion_data_root + 'filter_data_2_llava.json' +laion_data_path3 = laion_data_root + 'filter_data_3_llava.json' +laion_data_path4 = laion_data_root + 'filter_data_4_llava.json' +laion_data_path5 = laion_data_root + 'filter_data_5_llava.json' +laion_data_path6 = laion_data_root + 'filter_data_6_llava.json' +laion_data_path7 = laion_data_root + 'filter_data_7_llava.json' laion_image_folder = 'public:s3://public-dataset/laion-coco/images/' +# laion-coco-ocr +laion_ocr_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/orig_merge_17m_ocr_data/' +laion_ocr_data_path0 = laion_data_root + 'filter_data_0_llava.json' +laion_ocr_data_path1 = laion_data_root + 'filter_data_1_llava.json' + # coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' # coyo_data_path1 = coyo_data_root + 'filter_rand_20m_llava_1.json' # coyo_data_path2 = coyo_data_root + 'filter_rand_20m_llava_2.json' # coyo_data_path3 = coyo_data_root + 'filter_rand_20m_llava_3.json' # coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' +max_length = 4096 prompt_template = PROMPT_TEMPLATE.phi3_chat @@ -112,15 +124,180 @@ ####################################################################### # PART 3 Dataset & Dataloader # ####################################################################### -cache_2k_root = laion_data_root + 'phi3_mini_2k_offline/' -laion_coco_dataset = dict( +cache_4k_root = laion_data_root + 'phi3_mini_4k_offline/' +laion_coco_dataset0 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_0', + data_path=laion_data_path0, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset1 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_1', + data_path=laion_data_path1, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset2 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_2', + data_path=laion_data_path2, + 
image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset3 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_3', + data_path=laion_data_path3, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset4 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_4', + data_path=laion_data_path4, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset5 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_5', + data_path=laion_data_path5, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset6 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_6', + data_path=laion_data_path6, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset7 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_7', + data_path=laion_data_path7, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + + +###############################################################################333 +cache_4k_root = laion_ocr_data_root + 'phi3_mini_4k_offline/' +laion_coco_ocr_dataset0 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_ocr_dataset_10m_0', + 
data_path=laion_ocr_data_path0, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_ocr_dataset1 = dict( type=InternVL_V1_5_LLaVADataset, - use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'laion_coco_dataset_10m', - data_path=laion_data_path, + offline_processed_text_folder=cache_4k_root + 'laion_coco_ocr_dataset_10m_1', + data_path=laion_ocr_data_path1, image_folder=laion_image_folder, tokenizer=tokenizer, image_processor=image_processor, @@ -128,11 +305,10 @@ encode_map_fn=dict( type=internvl_1_5_encode_fn, min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 + max_num=max_num), template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), - max_length=2048) + max_length=max_length) # cache_2k_root = coyo_data_root + 'phi3_mini_2k_offline/' # coyo_dataset1 = dict( @@ -198,14 +374,13 @@ # type=template_map_fn_factory, template=prompt_template), # max_length=2048) -cache_2k_root = share_data_root + 'phi3_mini_2k_offline/' +cache_4k_root = share_data_root + 'phi3_mini_4k_offline/' sharegpt4v_dataset = dict( type=InternVL_V1_5_LLaVADataset, - use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'sharegpt4v_dataset', + offline_processed_text_folder=cache_4k_root + 'sharegpt4v_dataset', data_path=sharegpt4v_data_path, image_folder=sharegpt4v_image_folder, tokenizer=tokenizer, @@ -214,20 +389,18 @@ encode_map_fn=dict( type=internvl_1_5_encode_fn, min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 + max_num=max_num), template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), - max_length=2048) + max_length=max_length) -cache_2k_root = data_root + 'phi3_mini_2k_offline/' +cache_4k_root = data_root + 'phi3_mini_4k_offline/' allava_laion_dataset = dict( type=InternVL_V1_5_LLaVADataset, - use_patch=False, min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset', + offline_processed_text_folder=cache_4k_root + 'allava_laion_dataset', data_path=allava_laion_data_path, image_folder=allava_laion_image_folder, tokenizer=tokenizer, @@ -236,20 +409,18 @@ encode_map_fn=dict( type=internvl_1_5_encode_fn, min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 + max_num=max_num), template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), - max_length=2048) + max_length=max_length) -cache_2k_root = data_root + 'phi3_mini_2k_offline/' +cache_4k_root = data_root + 'phi3_mini_4k_offline/' allava_vflan_dataset = dict( type=InternVL_V1_5_LLaVADataset, - use_patch=False, min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset', + offline_processed_text_folder=cache_4k_root + 'allava_vflan_dataset', data_path=allava_vflan_data_path, image_folder=allava_vflan_image_folder, tokenizer=tokenizer, @@ -258,19 +429,17 @@ encode_map_fn=dict( type=internvl_1_5_encode_fn, min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 + max_num=max_num), 
template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), - max_length=2048) + max_length=max_length) allava_text_dataset = dict( type=InternVL_V1_5_LLaVADataset, - use_patch=False, min_num=min_num, max_num=max_num, downsample_ratio=downsample_ratio, - offline_processed_text_folder=cache_2k_root + 'allava_text_dataset', + offline_processed_text_folder=cache_4k_root + 'allava_text_dataset', data_path=allava_text_data_path, tokenizer=tokenizer, image_processor=image_processor, @@ -279,16 +448,17 @@ encode_map_fn=dict( type=internvl_1_5_encode_fn, min_num=min_num, - max_num=max_num, - use_patch=False), # 核心参数 + max_num=max_num), template_map_fn=dict( type=template_map_fn_factory, template=prompt_template), - max_length=2048) + max_length=max_length) train_dataset = dict( type=ConcatDataset, datasets=[ - laion_coco_dataset, # coyo_dataset1, coyo_dataset2, coyo_dataset3, + laion_coco_dataset0, laion_coco_dataset1, laion_coco_dataset2, laion_coco_dataset3, + laion_coco_dataset4, laion_coco_dataset5, laion_coco_dataset6, laion_coco_dataset7, + laion_coco_ocr_dataset0, laion_coco_ocr_dataset1, sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, allava_text_dataset, allava_text_dataset ]) From e0dbf4f89b2deefb57bd5949ea0e8a76606e1244 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Mon, 20 May 2024 16:55:46 +0800 Subject: [PATCH 116/126] update config --- .../phi3_internvl_1-5_more_data_pretrain.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index 7b02f7227..bca994df7 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -52,8 +52,8 @@ # laion-coco-ocr laion_ocr_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/orig_merge_17m_ocr_data/' -laion_ocr_data_path0 = laion_data_root + 'filter_data_0_llava.json' -laion_ocr_data_path1 = laion_data_root + 'filter_data_1_llava.json' +laion_ocr_data_path0 = laion_ocr_data_root + 'filter_data_0_llava.json' +laion_ocr_data_path1 = laion_ocr_data_root + 'filter_data_1_llava.json' # coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' # coyo_data_path1 = coyo_data_root + 'filter_rand_20m_llava_1.json' @@ -61,6 +61,10 @@ # coyo_data_path3 = coyo_data_root + 'filter_rand_20m_llava_3.json' # coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' +coco_caption_data_root = '/mnt/hwfile/xtuner/huanghaian/data/coco_caption/' +coco_caption_data_path = coco_caption_data_root + 'coco_karpathy_train_val_llava.json' +coco_caption_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/coco_caption/' + max_length = 4096 prompt_template = PROMPT_TEMPLATE.phi3_chat @@ -453,12 +457,32 @@ type=template_map_fn_factory, template=prompt_template), max_length=max_length) +cache_4k_root = coco_caption_data_root + 'phi3_mini_4k_offline/' +coco_caption_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'coco_karpathy_train_val_llava', + data_path=coco_caption_data_path, + image_folder=coco_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + 
template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + train_dataset = dict( type=ConcatDataset, datasets=[ laion_coco_dataset0, laion_coco_dataset1, laion_coco_dataset2, laion_coco_dataset3, laion_coco_dataset4, laion_coco_dataset5, laion_coco_dataset6, laion_coco_dataset7, - laion_coco_ocr_dataset0, laion_coco_ocr_dataset1, + laion_coco_ocr_dataset0, laion_coco_ocr_dataset1, coco_caption_dataset, sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, allava_text_dataset, allava_text_dataset ]) From 8939442b8552c96e7fc15e3fcc6f4e7766a8eae9 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 21 May 2024 16:52:31 +0800 Subject: [PATCH 117/126] update config --- .../phi3_internvl_1-5_more_data_pretrain.py | 147 ++++++++++++++++-- xtuner/dataset/huggingface.py | 11 +- xtuner/dataset/llava.py | 73 +++++---- 3 files changed, 184 insertions(+), 47 deletions(-) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py index bca994df7..1d3e59916 100644 --- a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -82,11 +82,11 @@ warmup_ratio = 0.03 # Save -save_steps = 5000 +save_steps = 10000 save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) # Evaluate the generation performance during the training -evaluation_freq = 5000 +evaluation_freq = 10000 SYSTEM = '' evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' evaluation_inputs = ['Please describe this picture'] @@ -111,12 +111,13 @@ model = dict( type=InternVL_v1_5_LLaVAModel, + use_lldr=True, # xxxxxxx downsample_ratio=downsample_ratio, tokenizer=tokenizer, template=prompt_template, image_processor=image_processor, freeze_llm=True, - freeze_visual_encoder=True, + freeze_visual_encoder=False, llm=dict( type=AutoModelForCausalLM.from_pretrained, pretrained_model_name_or_path=llm_name_or_path, @@ -274,7 +275,6 @@ type=template_map_fn_factory, template=prompt_template), max_length=max_length) - ###############################################################################333 cache_4k_root = laion_ocr_data_root + 'phi3_mini_4k_offline/' laion_coco_ocr_dataset0 = dict( @@ -477,14 +477,137 @@ type=template_map_fn_factory, template=prompt_template), max_length=max_length) +laion_gpt4v_root = '/mnt/hwfile/xtuner/huanghaian/data/laion_gpt4v/' +laion_gpt4v_data_path = laion_gpt4v_root + 'laion_gpt4v_llava.json' +laion_gpt4v_image_folder = laion_gpt4v_root + 'images/' + +cache_4k_root = laion_gpt4v_root + 'phi3_mini_4k_offline/' +laion_gpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_gpt4v_llava', + data_path=laion_gpt4v_data_path, + image_folder=laion_gpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +coco_text_root = '/mnt/hwfile/xtuner/huanghaian/data/coco_text/' +coco_text_data_path = coco_text_root + 'cocotext_v2_llava.json' +coco_text_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/coco_text/' + +cache_4k_root = coco_text_root + 
'phi3_mini_4k_offline/' +coco_text_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'coco_text_dataset', + data_path=coco_text_data_path, + image_folder=coco_text_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +textcap_root = '/mnt/hwfile/xtuner/huanghaian/data/TextCaps/' +textcap_data_path = textcap_root + 'TextCaps_0.1_train_val_llava.json' +textcap_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/TextCaps/' +cache_4k_root = textcap_root + 'phi3_mini_4k_offline/' + +text_cap_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'text_cap_dataset', + data_path=textcap_data_path, + image_folder=textcap_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +textocr_gpt4v_root = '/mnt/hwfile/xtuner/huanghaian/data/TextOCR-GPT4V/' +textocr_gpt4v_data_path = textocr_gpt4v_root + 'train_llava.json' +textocr_gpt4v_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/TextOCR-GPT4V/' +cache_4k_root = textocr_gpt4v_root + 'phi3_mini_4k_offline/' + +textocr_gpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'textocr_gpt4v_dataset', + data_path=textocr_gpt4v_data_path, + image_folder=textocr_gpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +bunny_laion_root = '/mnt/hwfile/xtuner/huanghaian/data/Bunny-v1_0-data/pretrain/' +bunny_laion_data_path = bunny_laion_root + 'bunny_pretrain_laion_2m_llava.json' +bunny_laion_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/Bunny-v1_0-data/pretrain/images' +cache_4k_root = bunny_laion_root + 'phi3_mini_4k_offline/' + +bunny_laion_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'bunny_laion_dataset', + data_path=bunny_laion_data_path, + image_folder=bunny_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +# 42m train_dataset = dict( type=ConcatDataset, datasets=[ - laion_coco_dataset0, laion_coco_dataset1, laion_coco_dataset2, laion_coco_dataset3, - laion_coco_dataset4, laion_coco_dataset5, laion_coco_dataset6, laion_coco_dataset7, + # laion_coco_dataset0, laion_coco_dataset1, laion_coco_dataset2, laion_coco_dataset3, + # laion_coco_dataset4, 
laion_coco_dataset5, laion_coco_dataset6, laion_coco_dataset7, + laion_coco_dataset0, laion_coco_dataset5, laion_coco_ocr_dataset0, laion_coco_ocr_dataset1, coco_caption_dataset, - sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, - allava_text_dataset, allava_text_dataset + sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, laion_gpt4v_dataset, + allava_text_dataset, coco_text_dataset, text_cap_dataset, textocr_gpt4v_dataset, + bunny_laion_dataset ]) train_dataloader = dict( @@ -507,8 +630,8 @@ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), accumulative_counts=accumulative_counts, - # constructor='LearningRateDecayOptimWrapperConstructor', # ==================== - # paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l loss_scale='dynamic', dtype='float16') @@ -556,13 +679,13 @@ # record the time of every iteration. timer=dict(type=IterTimerHook), # print log every 10 iterations. - logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=50), # enable the parameter scheduler. param_scheduler=dict(type=ParamSchedulerHook), # save checkpoint per `save_steps`. checkpoint=dict( type=CheckpointHook, - save_optimizer=False, + save_optimizer=True, by_epoch=False, interval=save_steps, max_keep_ckpts=save_total_limit), diff --git a/xtuner/dataset/huggingface.py b/xtuner/dataset/huggingface.py index c6937cf13..5aa7e374b 100644 --- a/xtuner/dataset/huggingface.py +++ b/xtuner/dataset/huggingface.py @@ -14,10 +14,18 @@ from xtuner.registry import BUILDER, MAP_FUNC from .utils import Packer from .utils import encode_fn as default_encode_fn +from .utils import total_image_token def get_lengths(example): - return {'length': len(example['input_ids'])} + cur_len = len(example['input_ids']) + if example.get('image', None) is not None: + assert 'image_wh' in example + size = example['image_wh'][0] + num_image_token = total_image_token(size, 1, 6, 336, 12) + cur_len += num_image_token + cur_len = -cur_len + return {'length': cur_len} def build_origin_dataset(dataset, split): @@ -228,6 +236,7 @@ def process(dataset, # add 'length' dataset = dataset.map(get_lengths, num_proc=map_num_proc) + setattr(dataset, 'modality_length', dataset['length']) setattr(dataset, 'length', dataset['length']) return dataset diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 68ad8b6d2..911b0e095 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -204,43 +204,48 @@ def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, use_p self.max_refetch = 1000 def __calc_fn(self, data_dict): - cur_len = len(data_dict['input_ids']) - if data_dict.get('image', None) is not None: - cur_len = len(data_dict['input_ids']) - if data_dict.get('image', None) is not None: - image_file = data_dict['image'] - assert 'image_wh' in data_dict - if 'image_wh' in data_dict: - size = data_dict['image_wh'][0] - else: - try: - image = self.get_image(os.path.join(self.image_folder, image_file)) - size = image.size - except Exception as e: - print(f'Error: {e}', flush=True) - print_log(f'Error: {e}', logger='current') - size = [1, 1] - if self.use_patch: - num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, - self._patch_size) - else: - num_image_token = 
self._patch_size * self._patch_size - cur_len += num_image_token - cur_len = -cur_len + cur_len = data_dict['length'] + # cur_len = len(data_dict['input_ids']) + # if data_dict.get('image', None) is not None: + # cur_len = len(data_dict['input_ids']) + # if data_dict.get('image', None) is not None: + # image_file = data_dict['image'] + # assert 'image_wh' in data_dict + # if 'image_wh' in data_dict: + # size = data_dict['image_wh'][0] + # else: + # try: + # image = self.get_image(os.path.join(self.image_folder, image_file)) + # size = image.size + # except Exception as e: + # print(f'Error: {e}', flush=True) + # print_log(f'Error: {e}', logger='current') + # size = [1, 1] + # if self.use_patch: + # num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, + # self._patch_size) + # else: + # num_image_token = self._patch_size * self._patch_size + # cur_len += num_image_token + # cur_len = -cur_len return cur_len # 太慢了,改离线吧 - # @property - # def modality_length(self): - # print_log('start calculating modality length', logger='current'), - # with ThreadPoolExecutor(max_workers=16) as executor: - # length_list = list( - # tqdm( - # executor.map(self.__calc_fn, self.text_data), - # desc='Calculating modality length', - # total=len(self.text_data))) - # print_log('end calculating modality length', logger='current'), - # return length_list + @property + def modality_length(self): + # 可以超级加速 + print_log('start calculating modality length', logger='current') + # with ThreadPoolExecutor(max_workers=16) as executor: + # length_list = list( + # tqdm( + # executor.map(self.__calc_fn, self.text_data), + # desc='Calculating modality length', + # total=len(self.text_data))) + # print_log('end calculating modality length', logger='current') + + length_list = self.text_data['length'] + print_log('end calculating modality length', logger='current') + return length_list def __getitem__(self, index): for _ in range(self.max_refetch + 1): From 5c744e745ab99af57dc437506c0e97ac1c30a6cd Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 30 May 2024 12:56:07 +0800 Subject: [PATCH 118/126] update 1.8 --- xtuner/model/internvl_1_5_llava.py | 66 ++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py index 59ac31830..4cc5a1b15 100644 --- a/xtuner/model/internvl_1_5_llava.py +++ b/xtuner/model/internvl_1_5_llava.py @@ -11,6 +11,9 @@ from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel import types +from mmengine.logging import print_log +import torch.nn as nn +from fairscale.nn.checkpoint import checkpoint_wrapper class InternVL_v1_5_LLaVAModel(LLaVAModel): @@ -30,7 +33,8 @@ def __init__(self, llm, template=None, use_lldr=False, # LearningRateDecayOptimWrapperConstructor merge_type='pixel_shuffle', # or pixel_shuffle - downsample_ratio=0.5): + downsample_ratio=0.5, + custom_mlp=False): super(LLaVAModel, self).__init__() self.downsample_ratio = downsample_ratio @@ -54,12 +58,24 @@ def __init__(self, llm, self.llm.config.use_cache = False dispatch_modules(self.llm) - projector_config = ProjectorConfig( - visual_hidden_size=self.visual_encoder.config.hidden_size * (int(1 / self.downsample_ratio) ** 2), - llm_hidden_size=self.llm.config.hidden_size, - depth=projector_depth) - self.projector = ProjectorModel(projector_config).to( - self.visual_encoder.dtype) + self.custom_mlp = custom_mlp + if custom_mlp is True: + self.mlp1 = nn.Sequential( + 
nn.LayerNorm(self.visual_encoder.config.hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(self.visual_encoder.config.hidden_size * int(1 / self.downsample_ratio) ** 2, + self.llm.config.hidden_size), + nn.GELU(), + nn.Linear(self.llm.config.hidden_size, self.llm.config.hidden_size) + ) + self.mlp1 = self.mlp1.to(self.visual_encoder.dtype) + self.mlp1 = checkpoint_wrapper(self.mlp1) + else: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size * (int(1 / self.downsample_ratio) ** 2), + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + self.projector = ProjectorModel(projector_config).to( + self.visual_encoder.dtype) if self.freeze_llm: self.llm.requires_grad_(False) @@ -74,12 +90,17 @@ def __init__(self, llm, else: self.llm.get_input_embeddings().register_forward_hook( make_inputs_require_grad) - if hasattr(self.visual_encoder, 'enable_input_require_grads'): - self.visual_encoder.enable_input_require_grads() + + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass else: - self.visual_encoder.get_input_embeddings( - ).register_forward_hook(make_inputs_require_grad) - self.projector.enable_input_require_grads() + if hasattr(self.visual_encoder, 'enable_input_require_grads'): + self.visual_encoder.enable_input_require_grads() + else: + self.visual_encoder.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + if custom_mlp is False: + self.projector.enable_input_require_grads() # enable gradient (activation) checkpointing for memory efficiency self.gradient_checkpointing_enable() @@ -95,7 +116,6 @@ def __init__(self, llm, if pretrained_pth is not None: pretrained_state_dict = guess_load_checkpoint(pretrained_pth) - self.load_state_dict(pretrained_state_dict, strict=False) print(f'Load pretrained weight from {pretrained_pth}') @@ -111,6 +131,26 @@ def __init__(self, llm, self.image_processor = BUILDER.build(image_processor) self.template = template + print_log(self, logger='current') + + def activation_checkpointing_enable(self): + self.llm.gradient_checkpointing_enable() + if self.custom_mlp is False: + self.projector.gradient_checkpointing_enable() + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass + else: + self.visual_encoder.gradient_checkpointing_enable() + + def activation_checkpointing_disable(self): + self.llm.gradient_checkpointing_disable() + if self.custom_mlp is False: + self.projector.gradient_checkpointing_disable() + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass + else: + self.visual_encoder.gradient_checkpointing_disable() + # The following code is only meaningful when the optim_wrapper configuration # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. 
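
For orientation: `get_layer_depth` below returns a `(layer_depth, num_layers)` pair per parameter, and a layer-decay constructor typically turns that pair into a per-group learning-rate scale of `layer_decay_rate ** (num_layers - 1 - layer_depth)`, so the ViT stem trains more slowly than the last blocks. The exact rule inside xtuner's `LearningRateDecayOptimWrapperConstructor` may differ; the names `lr_scale` and `base_lr` below are illustrative only.

# Hedged sketch of a typical layer-wise LR decay rule, not the in-tree implementation.
def lr_scale(layer_depth: int, num_layers: int, layer_decay_rate: float = 0.9) -> float:
    # Deeper (later) layers keep more of the base LR; the stem gets the smallest scale.
    return layer_decay_rate ** (num_layers - 1 - layer_depth)

base_lr = 2e-5
num_layers = 26  # e.g. 24 ViT blocks + embeddings + "everything else"
for depth in (0, 13, 25):
    print(depth, base_lr * lr_scale(depth, num_layers))
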
def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): From a3740043af34d57890b4af0f526452ae4dbc73d1 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 30 May 2024 13:22:15 +0800 Subject: [PATCH 119/126] update 1.8 --- xtuner/dataset/llava.py | 43 ++++++++++++++++--- .../process_untokenized_llava_concatdata.py | 36 ++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 xtuner/tools/process_untokenized_llava_concatdata.py diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 911b0e095..9cb4fca8a 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -182,24 +182,50 @@ def __getitem__(self, index): return data_dict +from torchvision.transforms.functional import InterpolationMode +import torchvision.transforms as T + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + class InternVL_V1_5_LLaVADataset(LLaVADataset): - def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, use_patch=True, *args, **kwargs): + def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, use_patch=True, custom=False, *args, + **kwargs): self.min_num = min_num self.max_num = max_num self.downsample_ratio = downsample_ratio self.use_patch = use_patch super().__init__(*args, **kwargs) - if hasattr(self.image_processor, 'crop_size'): - self._crop_size = self.image_processor.crop_size + self.custom = custom + + if custom: + self.image_processor = build_transform(448) + self._crop_size = {'height': 448, 'width': 448} else: - self._crop_size = self.image_processor.size + if hasattr(self.image_processor, 'crop_size'): + self._crop_size = self.image_processor.crop_size + else: + self._crop_size = self.image_processor.size + self._patch_size = self._crop_size['height'] self._shortest_edge = self._crop_size['height'] # clip self._image_size = image_size - self._patch_size = (self._image_size // 14) * downsample_ratio # 12 + self._patch_size = (self._image_size // 14) * downsample_ratio # 12, 16 self.max_refetch = 1000 @@ -268,8 +294,11 @@ def prepare_data(self, index): return None images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size, use_patch=self.use_patch) for i, image in enumerate(images): - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] + if self.custom: + image = self.image_processor(image) + else: + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] images[i] = image images = torch.stack(images, dim=0) data_dict['pixel_values'] = images diff --git a/xtuner/tools/process_untokenized_llava_concatdata.py b/xtuner/tools/process_untokenized_llava_concatdata.py new file mode 100644 index 000000000..097f324d6 --- /dev/null +++ b/xtuner/tools/process_untokenized_llava_concatdata.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
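
As an aside on the `# 12, 16` comment in the hunk above: with 14-pixel ViT patches and `downsample_ratio=0.5`, a 336-pixel tile gives `(336 // 14) * 0.5 = 12` tokens per side (144 tokens per tile) and a 448-pixel tile gives `(448 // 14) * 0.5 = 16` (256 tokens per tile). The sketch below only reproduces that arithmetic; `tokens_per_tile` is an illustrative helper, not an xtuner API, and it assumes each dynamic-resolution tile contributes `patch_size ** 2` tokens after merging.

# Hedged sketch of the per-tile token arithmetic implied by the comment above.
def tokens_per_tile(image_size: int, downsample_ratio: float = 0.5, vit_patch: int = 14) -> int:
    patch_size = int((image_size // vit_patch) * downsample_ratio)  # tokens per side after merging
    return patch_size * patch_size

print(tokens_per_tile(336))  # 144  (12 x 12)
print(tokens_per_tile(448))  # 256  (16 x 16)
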
+import argparse +import warnings + +from mmengine import Config + +from xtuner.registry import BUILDER + +# ignore FutureWarning in hf datasets +warnings.simplefilter(action='ignore', category=FutureWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='config file name or path.') + parser.add_argument('--save-folder', help='The folder to save data order.') + args = parser.parse_args() + return args + + +def build_llava_dataset(config): + dataset = BUILDER.build(config) + return dataset + + +if __name__ == '__main__': + args = parse_args() + cfg = Config.fromfile(args.config) + + datasets = cfg.train_dataloader.datasets + for dataset_cfg in datasets: + llava_dataset = build_llava_dataset(dataset_cfg) + text_data = llava_dataset.text_data + variable_name = [k for k, v in locals().items() if v == dataset_cfg][0] + save_folder = args.save_folder + f'/{variable_name}' + text_data.save_to_disk(save_folder) From 0e8febb02b97213ec3689453288cf3d22eaa60be Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 30 May 2024 14:34:29 +0800 Subject: [PATCH 120/126] fix --- xtuner/tools/process_untokenized_llava_concatdata.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xtuner/tools/process_untokenized_llava_concatdata.py b/xtuner/tools/process_untokenized_llava_concatdata.py index 097f324d6..4326224c4 100644 --- a/xtuner/tools/process_untokenized_llava_concatdata.py +++ b/xtuner/tools/process_untokenized_llava_concatdata.py @@ -13,7 +13,6 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('config', help='config file name or path.') - parser.add_argument('--save-folder', help='The folder to save data order.') args = parser.parse_args() return args @@ -27,10 +26,9 @@ def build_llava_dataset(config): args = parse_args() cfg = Config.fromfile(args.config) - datasets = cfg.train_dataloader.datasets + datasets = cfg.train_dataloader.dataset.datasets for dataset_cfg in datasets: + offline_processed_text_folder = dataset_cfg.pop('offline_processed_text_folder') llava_dataset = build_llava_dataset(dataset_cfg) text_data = llava_dataset.text_data - variable_name = [k for k, v in locals().items() if v == dataset_cfg][0] - save_folder = args.save_folder + f'/{variable_name}' - text_data.save_to_disk(save_folder) + text_data.save_to_disk(offline_processed_text_folder) From 0c09d9c9fee1c2f416e387769b60723aad8c69d0 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Thu, 30 May 2024 20:59:09 +0800 Subject: [PATCH 121/126] fix --- xtuner/dataset/huggingface.py | 11 +++-- xtuner/dataset/llava.py | 2 +- xtuner/dataset/utils.py | 6 ++- .../process_untokenized_llava_concatdata.py | 48 +++++++++++++++++-- 4 files changed, 57 insertions(+), 10 deletions(-) diff --git a/xtuner/dataset/huggingface.py b/xtuner/dataset/huggingface.py index 5aa7e374b..c3e2c93ad 100644 --- a/xtuner/dataset/huggingface.py +++ b/xtuner/dataset/huggingface.py @@ -21,10 +21,13 @@ def get_lengths(example): cur_len = len(example['input_ids']) if example.get('image', None) is not None: assert 'image_wh' in example - size = example['image_wh'][0] - num_image_token = total_image_token(size, 1, 6, 336, 12) - cur_len += num_image_token - cur_len = -cur_len + image_wh = example['image_wh'] + if image_wh is not None: + if isinstance(image_wh[0], int): + image_wh = [image_wh] + num_image_token = total_image_token(image_wh[0], 1, 12, 448, 16) + cur_len += num_image_token + cur_len = -cur_len return {'length': cur_len} diff --git a/xtuner/dataset/llava.py 
b/xtuner/dataset/llava.py index 9cb4fca8a..e8400478d 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -66,7 +66,7 @@ def __init__(self, raise NotImplementedError for idx in range(len(json_data)): - if isinstance(json_data[idx]['id'], int): + if 'id' in json_data[idx] and isinstance(json_data[idx]['id'], int): json_data[idx]['id'] = str(json_data[idx]['id']) json_data = DatasetDict({'train': HFDataset.from_list(json_data)}) self.text_data = process_hf_dataset( diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index d6a184c6a..f044e5c89 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -575,7 +575,11 @@ def internvl_1_5_encode_fn(example, if 'image' in example: if use_patch: assert 'image_wh' in example - img_token = total_image_token(example['image_wh'][0], min_num, max_num, image_size, patch_size) + image_wh = example['image_wh'] + if image_wh is not None: + if isinstance(image_wh[0], int): + image_wh = [image_wh] + img_token = total_image_token(image_wh[0], min_num, max_num, image_size, patch_size) else: # clip img_token = patch_size * patch_size diff --git a/xtuner/tools/process_untokenized_llava_concatdata.py b/xtuner/tools/process_untokenized_llava_concatdata.py index 4326224c4..124be9ac6 100644 --- a/xtuner/tools/process_untokenized_llava_concatdata.py +++ b/xtuner/tools/process_untokenized_llava_concatdata.py @@ -3,8 +3,11 @@ import warnings from mmengine import Config +import numpy as np from xtuner.registry import BUILDER +from tqdm import tqdm +from mmengine.logging import MMLogger # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -26,9 +29,46 @@ def build_llava_dataset(config): args = parse_args() cfg = Config.fromfile(args.config) + logger = MMLogger.get_instance( + name='xtuner', + log_file='benchmark_test.log') + datasets = cfg.train_dataloader.dataset.datasets - for dataset_cfg in datasets: + for dataset_cfg in tqdm(datasets): offline_processed_text_folder = dataset_cfg.pop('offline_processed_text_folder') - llava_dataset = build_llava_dataset(dataset_cfg) - text_data = llava_dataset.text_data - text_data.save_to_disk(offline_processed_text_folder) + logger.info('=================================================================') + logger.info(f'offline_processed_text_folder: {offline_processed_text_folder}') + try: + llava_dataset = build_llava_dataset(dataset_cfg) + text_data = llava_dataset.text_data + + length_list = text_data['length'] + length_np = np.abs(length_list) + min_, max_, mid_ = np.min(length_np), np.max(length_np), np.median(length_np) + logger.info(f'token len({length_np.shape[0]}): max: {max_}, min: {min_}, mid: {mid_}') + try: + image_wh_list = text_data['image_wh'] + new_list = [] + for d in image_wh_list: + if d is not None: + if isinstance(d[0], list): + new_list.append(d[0]) + else: + new_list.append(d) + new_list = np.array(new_list).reshape(-1, 2) + row_sums = np.sum(new_list, axis=1) + max_idx = np.argmax(row_sums) + min_idx = np.argmin(row_sums) + mid_idx = np.argsort(row_sums)[len(row_sums) // 2] + max_value = new_list[max_idx] + min_value = new_list[min_idx] + mid_value = new_list[mid_idx] + logger.info(f'Image wh: max: {max_value}, min: {min_value}, mid: {mid_value}\n') + + except Exception as e: + logger.error(f'=======Error: {e}') + + text_data.save_to_disk(offline_processed_text_folder) + except Exception as e: + logger.error(f'--------Error: {e}') + raise NotImplementedError From 7b46ab38cf284f4c68bed5e0699ee98fcc9d44e9 Mon Sep 17 
00:00:00 2001 From: huanghaian Date: Fri, 31 May 2024 14:05:10 +0800 Subject: [PATCH 122/126] fix eval --- .../internvl_v1_5_llava_proxy_eval_dataset.py | 40 +++++++++++++++---- .../map_fns/dataset_map_fns/llava_map_fn.py | 5 +++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py index 3c733ba43..4a8cd4985 100644 --- a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py +++ b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py @@ -5,18 +5,41 @@ from xtuner.tools.utils import is_cn_string from .utils import dynamic_preprocess +from torchvision.transforms.functional import InterpolationMode +import torchvision.transforms as T + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + class InternVL_v1_5_LLaVAProxyEvalDataset: - def __init__(self, eval_dataset, min_num, max_num): + def __init__(self, eval_dataset, min_num, max_num, custom=False): self.eval_ds = eval_dataset self.min_num = min_num self.max_num = max_num - # TODO: Assuming they are all squares. - if hasattr(eval_dataset.image_processor, 'crop_size'): - self._crop_size = eval_dataset.image_processor.crop_size + self.custom = custom + if custom: + self.image_processor = build_transform(448) + self._crop_size = {'height': 448, 'width': 448} else: - self._crop_size = eval_dataset.image_processor.size + # TODO: Assuming they are all squares. 
+ if hasattr(eval_dataset.image_processor, 'crop_size'): + self._crop_size = eval_dataset.image_processor.crop_size + else: + self._crop_size = eval_dataset.image_processor.size + self._image_size = self._crop_size['height'] def getitem(self, idx, data): @@ -83,8 +106,11 @@ def getitem(self, idx, data): images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) for i, image in enumerate(images): - image = self.eval_ds.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] + if self.custom: + image = self.image_processor(image) + else: + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] images[i] = image images = torch.stack(images, dim=0) data_dict['pixel_values'] = images diff --git a/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py b/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py index a08ca395b..5449588e5 100644 --- a/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py +++ b/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py @@ -29,6 +29,7 @@ def llava_map_fn(example): while messages and messages[0]['from'] == 'gpt': # Skip the first one if it is from gpt messages = messages[1:] + # system_v = '' for msg in messages: if msg['from'] == 'human': if DEFAULT_IMAGE_TOKEN in msg['value']: @@ -40,7 +41,11 @@ def llava_map_fn(example): elif msg['from'] == 'gpt': conversation.append({'input': input, 'output': msg['value']}) + # conversation.append({'input': input, 'output': msg['value'], 'system': system_v}) input = '' + # system_v = '' + # elif msg['from'] == 'system': + # system_v = msg['value'] else: raise NotImplementedError return {'conversation': conversation} From 41da1b12165ce0b8af0b02f27b23abc30bb9132b Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 31 May 2024 16:14:35 +0800 Subject: [PATCH 123/126] fix --- xtuner/dataset/llava.py | 4 ++++ xtuner/engine/optimizers/__init__.py | 5 ++-- xtuner/engine/optimizers/utils.py | 36 ++++++++++++++++++++++++++++ xtuner/model/llava.py | 8 ++++++- 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index e8400478d..25c21f7d7 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -286,6 +286,10 @@ def prepare_data(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] + if isinstance(image_file, list): + if len(image_file) > 1: + return None + image_file = image_file[0] try: image = self.get_image(os.path.join(self.image_folder, image_file)) except Exception as e: diff --git a/xtuner/engine/optimizers/__init__.py b/xtuner/engine/optimizers/__init__.py index 9372c9ff7..adf3acc80 100644 --- a/xtuner/engine/optimizers/__init__.py +++ b/xtuner/engine/optimizers/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
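
Stepping back to the length bookkeeping carried through the earlier hunks (`get_lengths` and `modality_length` store image-bearing samples with a negative token count): the sign acts as a cheap modality tag, letting a grouped sampler keep text-only and image samples apart while still bucketing by `abs(length)`. The sketch below is a hedged illustration of that convention; `split_by_modality` is an invented name, and xtuner's actual length-grouped sampler may work differently.

# Hedged sketch of how a sampler could exploit the "negative length = has image" convention.
def split_by_modality(lengths):
    text_idx = [i for i, n in enumerate(lengths) if n > 0]   # text-only samples
    image_idx = [i for i, n in enumerate(lengths) if n < 0]  # image samples (stored negated)
    # Within each group, sort by absolute token count so batches stay length-homogeneous.
    text_idx.sort(key=lambda i: abs(lengths[i]))
    image_idx.sort(key=lambda i: abs(lengths[i]))
    return text_idx, image_idx

print(split_by_modality([120, -860, 45, -1210, 300]))  # ([2, 0, 4], [1, 3])
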
from .layer_decay_optim_wrapper_constructor import LearningRateDecayOptimWrapperConstructor -from .utils import get_layer_depth_for_CLIPVisionModel +from .utils import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel __all__ = [ - 'LearningRateDecayOptimWrapperConstructor', 'get_layer_depth_for_CLIPVisionModel' + 'LearningRateDecayOptimWrapperConstructor', 'get_layer_depth_for_CLIPVisionModel', + 'get_layer_depth_for_InternVisionModel' ] diff --git a/xtuner/engine/optimizers/utils.py b/xtuner/engine/optimizers/utils.py index d1279c1a8..6cf2d0677 100644 --- a/xtuner/engine/optimizers/utils.py +++ b/xtuner/engine/optimizers/utils.py @@ -33,3 +33,39 @@ def get_layer_depth_for_CLIPVisionModel(self, param_name: str, prefix: str = 'vi layer_depth = num_layers - 1 return layer_depth, num_layers + + +def get_layer_depth_for_InternVisionModel(self, param_name: str, prefix: str = ''): + """Get the layer-wise depth of a parameter. + + Args: + param_name (str): The name of the parameter. + prefix (str): The prefix for the parameter. + Defaults to an empty string. + + Returns: + Tuple[int, int]: The layer-wise depth and the num of layers. + + Note: + The first depth is the stem module (``layer_depth=0``), and the + last depth is the subsequent module (``layer_depth=num_layers-1``) + """ + num_layers = self.config.num_hidden_layers + 2 + + if not param_name.startswith(prefix): + # For subsequent module like head + return num_layers - 1, num_layers + + param_name = param_name[len(prefix):] + + if param_name.startswith('embeddings'): + layer_depth = 0 + elif param_name.startswith('pre_layrnorm'): + layer_depth = 0 + elif param_name.startswith('encoder.layers'): + layer_id = int(param_name.replace('encoder.', '').split('.')[1]) + layer_depth = layer_id + 1 + else: + layer_depth = num_layers - 1 + + return layer_depth, num_layers \ No newline at end of file diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 1a4003dc3..e437c0db2 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -22,7 +22,7 @@ StopWordStoppingCriteria) from functools import reduce from mmengine.logging import print_log -from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel import types @@ -65,6 +65,8 @@ def __init__(self, # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, self.visual_encoder) + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) @@ -152,6 +154,10 @@ def __init__(self, def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): assert hasattr(self.visual_encoder, 'get_layer_depth'), \ 'The visual_encoder does not have `get_layer_depth` method.' + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + prefix = 'visual_encoder.vision_model.' + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + prefix = 'visual_encoder.' 
return self.visual_encoder.get_layer_depth(param_name, prefix) def _parse_lora_config(self, lora_config): From 44b8d1cab968fbf58b1896ee31c93712b5494860 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 31 May 2024 16:18:53 +0800 Subject: [PATCH 124/126] fix --- xtuner/model/internvl_1_5_llava.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py index 4cc5a1b15..350123352 100644 --- a/xtuner/model/internvl_1_5_llava.py +++ b/xtuner/model/internvl_1_5_llava.py @@ -9,7 +9,7 @@ make_inputs_require_grad, prepare_inputs_labels_for_multimodal) -from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel import types from mmengine.logging import print_log import torch.nn as nn @@ -52,9 +52,15 @@ def __init__(self, llm, if use_lldr: # The following code is only meaningful when the optim_wrapper configuration # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. - if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': - self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, - self.visual_encoder) + if use_lldr: + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, + self.visual_encoder) + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, + self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) @@ -156,6 +162,10 @@ def activation_checkpointing_disable(self): def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): assert hasattr(self.visual_encoder, 'get_layer_depth'), \ 'The visual_encoder does not have `get_layer_depth` method.' + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + prefix = 'visual_encoder.vision_model.' + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + prefix = 'visual_encoder.' return self.visual_encoder.get_layer_depth(param_name, prefix) def _prepare_data_for_llm(self, data): From 673ce12b4f11e1275f271543ad02b7ec01f6b163 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 31 May 2024 16:31:14 +0800 Subject: [PATCH 125/126] fix --- xtuner/model/internvl_1_5_llava.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py index 350123352..21d2ee4df 100644 --- a/xtuner/model/internvl_1_5_llava.py +++ b/xtuner/model/internvl_1_5_llava.py @@ -52,15 +52,12 @@ def __init__(self, llm, if use_lldr: # The following code is only meaningful when the optim_wrapper configuration # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. - if use_lldr: - # The following code is only meaningful when the optim_wrapper configuration - # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. 
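
The hunk below cleans up the doubly-nested `if use_lldr:` guard introduced in the previous patch; what survives simply binds a depth-lookup function onto the vision-tower instance via `types.MethodType`, so the optimizer constructor can query depths through the model. A minimal, self-contained sketch of that binding pattern follows; the `SimpleNamespace` stand-in and the simplified depth rule are illustrative only, not xtuner's classes.

# Hedged sketch of the MethodType binding used for layer-wise LR decay.
import types
from types import SimpleNamespace

def get_layer_depth(self, param_name, prefix=''):
    # Same contract as the get_layer_depth_for_* helpers: (layer_depth, num_layers) per parameter.
    num_layers = self.config.num_hidden_layers + 2
    name = param_name[len(prefix):] if param_name.startswith(prefix) else param_name
    if name.startswith('embeddings'):
        return 0, num_layers
    if name.startswith('encoder.layers'):
        return int(name.split('.')[2]) + 1, num_layers
    return num_layers - 1, num_layers

tower = SimpleNamespace(config=SimpleNamespace(num_hidden_layers=24))
tower.get_layer_depth = types.MethodType(get_layer_depth, tower)  # bind onto the instance
print(tower.get_layer_depth('encoder.layers.3.mlp.fc1.weight'))   # (4, 26)
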
- if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': - self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, - self.visual_encoder) - elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': - self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, - self.visual_encoder) + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, + self.visual_encoder) + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, + self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) @@ -191,7 +188,7 @@ def __preprocess_for_pixel_values(self, data): # b*n, hw, d visual_outputs = self.visual_encoder(concat_images, output_hidden_states=True) - if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + if self._get_model_class_name(self.visual_encoder) in ['CLIPVisionModel', 'InternVisionModel']: vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:] elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel': vit_embeds = visual_outputs.hidden_states[self.visual_select_layer] From 17ab71c38117253501a40aff55cb71259a25ba0c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 31 May 2024 17:13:11 +0800 Subject: [PATCH 126/126] fix --- xtuner/model/internvl_1_5_llava.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py index 21d2ee4df..173754648 100644 --- a/xtuner/model/internvl_1_5_llava.py +++ b/xtuner/model/internvl_1_5_llava.py @@ -54,10 +54,10 @@ def __init__(self, llm, # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, - self.visual_encoder) + self.visual_encoder) elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, - self.visual_encoder) + self.visual_encoder) self.llm.config.use_cache = False dispatch_modules(self.llm) @@ -202,7 +202,10 @@ def __preprocess_for_pixel_values(self, data): # n,h'w',c' vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) - vit_embeds = self.projector(vit_embeds) + if self.custom_mlp is False: + vit_embeds = self.projector(vit_embeds) + else: + vit_embeds = self.mlp1(vit_embeds) split_sizes = [image.shape[0] for image in pixel_values] image_features = torch.split(vit_embeds, split_sizes, dim=0)
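
To make the shape bookkeeping in `__preprocess_for_pixel_values` concrete: after dropping the CLS token, each 448-pixel tile yields a 32 x 32 token grid, and the pixel-shuffle merge folds every 2 x 2 neighbourhood into one token with four times the channels, which is why `mlp1` and the projector take `hidden_size * int(1 / self.downsample_ratio) ** 2` as their input width. The sketch below illustrates that merge under the stated assumptions (square grid, CLS already removed); `pixel_shuffle_tokens` is an illustrative name and the in-tree `pixel_shuffle` may order elements differently.

# Hedged sketch of the pixel-shuffle merge applied before the projector / mlp1.
import torch

def pixel_shuffle_tokens(vit_embeds: torch.Tensor, downsample_ratio: float = 0.5) -> torch.Tensor:
    n, l, c = vit_embeds.shape
    h = w = int(l ** 0.5)                      # assume a square token grid (no CLS token)
    r = int(1 / downsample_ratio)              # e.g. 2
    x = vit_embeds.reshape(n, h, w, c)
    x = x.reshape(n, h // r, r, w // r, r, c)  # split the grid into r x r neighbourhoods
    x = x.permute(0, 1, 3, 2, 4, 5)            # group each neighbourhood's tokens together
    return x.reshape(n, (h // r) * (w // r), c * r * r)

tiles = torch.randn(7, 32 * 32, 1024)          # 7 tiles, 1024 tokens each, hidden size 1024
merged = pixel_shuffle_tokens(tiles)
print(merged.shape)                            # torch.Size([7, 256, 4096]) -> fed to projector / mlp1
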