From 3e0b5bbe8c23c5c23d0a90b1be66a0881f742c21 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Tue, 24 Sep 2024 21:45:28 +0400
Subject: [PATCH] small refactoring (#2411)

Co-authored-by: Andrei Anufriev
---
 notebooks/mllama-3.2/data_preprocessing.py    |  29 ---
 notebooks/mllama-3.2/ov_mllama_compression.py |   6 -
 notebooks/mllama-3.2/ov_mllama_helper.py      | 175 +++++++++++++++++-
 3 files changed, 171 insertions(+), 39 deletions(-)

diff --git a/notebooks/mllama-3.2/data_preprocessing.py b/notebooks/mllama-3.2/data_preprocessing.py
index 370187976eb..61b8246edce 100644
--- a/notebooks/mllama-3.2/data_preprocessing.py
+++ b/notebooks/mllama-3.2/data_preprocessing.py
@@ -39,35 +39,6 @@ def get_pil_from_url(url):
     return image.convert("RGB")
 
 
-# def collate_fn_llm(example, image_column="image_url", text_column="caption"):
-#     """
-#     Preprocesses an example by loading and transforming image and text data.
-#     Checks if the text data in the example is valid by calling the `check_text_data` function.
-#     Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
-#     If there is any error during the download process, returns None.
-#     Returns the preprocessed inputs with transformed image and text data.
-#     """
-#     assert len(example) == 1
-#     example = example[0]
-
-#     if not check_text_data(example[text_column]):
-#         raise ValueError("Text data is not valid")
-
-#     url = example[image_column]
-#     try:
-#         image = get_pil_from_url(url)
-#         h, w = image.size
-#         if h == 1 or w == 1:
-#             return None
-#     except Exception:
-#         return None
-
-#     inputs = processor(text="<|image|><|begin_of_text|>"+example[text_column], images=image, return_tensors="pt", padding=True)
-#     if inputs['input_ids'].shape[1] > max_length:
-#         return None
-#     return inputs
-
-
 def prepare_calibration_data_vision(dataloader, init_steps):
     """
     This function prepares calibration data from a dataloader for a specified number of initialization steps.
diff --git a/notebooks/mllama-3.2/ov_mllama_compression.py b/notebooks/mllama-3.2/ov_mllama_compression.py
index b6b602f3a99..8beb9def891 100644
--- a/notebooks/mllama-3.2/ov_mllama_compression.py
+++ b/notebooks/mllama-3.2/ov_mllama_compression.py
@@ -133,9 +133,3 @@ def compress(
     print(f"Model compression finished. Compressed model can be found in {saving_path}")
     return saving_path
-
-
-# model_id = "Llama-3.2-11B-Vision-Instruct/OV"
-# processor = AutoProcessor.from_pretrained(model_id)
-
-# compress(model_id, processor)
diff --git a/notebooks/mllama-3.2/ov_mllama_helper.py b/notebooks/mllama-3.2/ov_mllama_helper.py
index f5ddcd86276..84360a1681d 100644
--- a/notebooks/mllama-3.2/ov_mllama_helper.py
+++ b/notebooks/mllama-3.2/ov_mllama_helper.py
@@ -4,7 +4,6 @@
 from transformers.models.llama.modeling_llama import repeat_kv
 from openvino.frontend.pytorch.patch_model import __make_16bit_traceable
 from typing import Optional, Union, List, Tuple, Dict
-from optimum.exporters.openvino.stateful import patch_stateful
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import ModelOutput
 import openvino.runtime.opset13 as ops
@@ -83,6 +82,176 @@ def callback(matcher: Matcher) -> bool:
 }
 
 
+def model_has_state(ov_model: ov.Model):
+    """Returns True if the model already has state (contains sink ops)."""
+    return len(ov_model.get_sinks()) > 0
+
+
+def model_has_input_output_name(ov_model: ov.Model, name: str):
+    """
+    Helper function for checking that the model has a specified input or output name.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      name (str):
+          name of input or output
+
+    Returns:
+      True if an input or output with the requested name exists, else False
+    """
+    return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])
+
+
+def fuse_cache_reorder(
+    ov_model: ov.Model,
+    not_kv_inputs: List[str],
+    key_value_input_names: List[str],
+    gather_dim: int,
+):
+    """
+    Fuses cache reordering during the generate cycle into the ov.Model. Used with stateful models,
+    because the model state cannot be modified directly.
+
+    Adds a new beam_idx parameter and a Gather op for each kv-cache input in the given model.
+    Should be run before make_stateful. Implements optimum's _reorder_cache
+    inside the model at the beginning of each iteration.
+    Gather works along the given gather_dim dimension, which may vary from model to model.
+    KV-cache inputs are identified based on the names in key_value_input_names.
+    Appends the new beam_idx parameter to not_kv_inputs.
+
+    Parameters:
+      ov_model (`ov.Model`):
+          openvino model for processing
+      not_kv_inputs (`List[str]`):
+          list of model inputs that are not related to past key values
+      key_value_input_names (`List[str]`):
+          list of names for key value input layers
+      gather_dim (int):
+          dimension for gathering cache during the reorder pass
+    """
+
+    if model_has_input_output_name(ov_model, "beam_idx"):
+        raise ValueError("Model already has fused cache")
+    input_batch = ov_model.input("input_ids").get_partial_shape()[0]
+    beam_idx = ops.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
+    beam_idx.output(0).get_tensor().add_names({"beam_idx"})  # add_names expects a set, not a list
+    ov_model.add_parameters([beam_idx])
+    not_kv_inputs.append(ov_model.inputs[-1])
+    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
+    for input_name in key_value_input_names:
+        parameter_output_port = ov_model.input(input_name)
+        consumers = parameter_output_port.get_target_inputs()
+        gather = ops.gather(parameter_output_port, beam_idx, ops.constant(gather_dim))
+        for consumer in consumers:
+            consumer.replace_source_output(gather.output(0))
+    ov_model.validate_nodes_and_infer_types()
+
+
+def build_state_initializer(ov_model: ov.Model, batch_dim: int):
+    """
+    Builds an initialization ShapeOf expression for all ReadValue ops.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      batch_dim (int):
+          index of dimension corresponding to batch size
+    """
+    input_ids = ov_model.input("input_ids")
+    batch = ops.gather(
+        ops.shape_of(input_ids, output_type="i64"),
+        ops.constant([0]),
+        ops.constant(0),
+    )
+    for op in ov_model.get_ops():
+        if op.get_type_name() == "ReadValue":
+            dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
+            dims[batch_dim] = batch
+            dims = [(ops.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
+            shape = ops.concat(dims, axis=0)
+            broadcast = ops.broadcast(ops.constant(0.0, dtype=op.get_output_element_type(0)), shape)
+            op.set_arguments([broadcast])
+    ov_model.validate_nodes_and_infer_types()
+
+
+def make_stateful(
+    ov_model: ov.Model,
+    not_kv_inputs: List[str],
+    key_value_input_names: List[str],
+    key_value_output_names: List[str],
+    batch_dim: int,
+    num_attention_heads: int,
+    num_beams_and_batch: Optional[int] = None,
+):
+    """
+    Hides kv-cache inputs and outputs inside the model as variables.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      not_kv_inputs (`List[str]`):
+          list of model inputs that are not related to past key values
+      key_value_input_names (`List[str]`):
+          list of names for key value input layers
+      key_value_output_names (`List[str]`):
+          list of names for key value output layers
+      batch_dim (int):
+          index of batch dimension in key value layers
+      num_attention_heads (int):
+          number of attention heads for batch dimension initialization
+      num_beams_and_batch (int):
+          precalculated number of beams and batch for shape initialization
+    """
+    from openvino._offline_transformations import apply_make_stateful_transformation
+
+    input_output_map = {}
+
+    if num_beams_and_batch is not None:
+        # Set batch size for input_ids and attention_mask to avoid a dynamic dimension being propagated from the end of the model back to ReadValue
+        for input in not_kv_inputs:
+            shape = input.get_partial_shape()
+            if shape.rank.get_length() <= 2:  # == 1 for beam_index
+                shape[0] = num_beams_and_batch
+                input.get_node().set_partial_shape(shape)
+    for kv_name_pair in zip(key_value_input_names, key_value_output_names):
+        input_output_map[kv_name_pair[0]] = kv_name_pair[1]
+        if num_beams_and_batch is not None:
+            input = ov_model.input(kv_name_pair[0])
+            shape = input.get_partial_shape()
+            shape[batch_dim] = num_beams_and_batch * num_attention_heads
+            input.get_node().set_partial_shape(shape)
+
+    if num_beams_and_batch is not None:
+        # Re-validate the model if shapes were altered above
+        ov_model.validate_nodes_and_infer_types()
+
+    apply_make_stateful_transformation(ov_model, input_output_map)
+    if num_beams_and_batch is None:
+        build_state_initializer(ov_model, batch_dim)
+
+
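+# patch_stateful below ties the helpers above together: it discovers the kv-cache inputs
+# and outputs by name (the "past_key_values" and "present" substrings), fuses the beam_idx
+# cache reorder, and then hides the kv-cache tensors inside the model as state variables.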
+def patch_stateful(ov_model):
+    key_value_input_names = [key_name for key in ov_model.inputs for key_name in key.get_names() if "past_key_values" in key_name]
+    key_value_output_names = [key_name for key in ov_model.outputs for key_name in key.get_names() if "present" in key_name]
+    not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())]
+    if not key_value_input_names or not key_value_output_names:
+        return
+    batch_dim = 0
+    num_attention_heads = 1
+
+    fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
+    make_stateful(
+        ov_model,
+        not_kv_inputs,
+        key_value_input_names,
+        key_value_output_names,
+        batch_dim,
+        num_attention_heads,
+        None,
+    )
+
+
 def convert_mllama(model_id, out_dir):
 
     out_dir = Path(out_dir)
@@ -306,8 +475,7 @@ def cross_attn_forward(
         output.get_tensor().set_names({output_name})
 
     ov_model.validate_nodes_and_infer_types()
-
-    patch_stateful(model.config.text_config, ov_model)
+    patch_stateful(ov_model)
 
     ov.save_model(ov_model, lang_model_path)
     del ov_model
     cleanup_torchscript_cache()
@@ -785,7 +953,6 @@ def prepare_remote_tensors(self):
 
 
 if __name__ == "__main__":
-    # convert_mllama("/home/ea/llama3.2/Llama-3.2-11B-Vision-Instruct", "Llama-3.2-11B-Vision-Instruct/OV")
     model_id = "Llama-3.2-11B-Vision-Instruct/OV"
     LANGUAGE_MODEL_NAME = "llm_int4_asym_r10_gs64_max_activation_variance_all_layers.xml"
     IMAGE_ENCODER_NAME = "openvino_vision_encoder.xml"
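
Usage sketch (illustrative): after this refactoring, patch_stateful no longer takes the text
config, and the notebook drives the scripts the same way as before. Assuming the original
checkpoint has been downloaded locally (both paths below are placeholders), conversion and
compression would look roughly like this:

    from transformers import AutoProcessor

    from ov_mllama_helper import convert_mllama
    from ov_mllama_compression import compress

    # Export the PyTorch checkpoint to OpenVINO IR.
    convert_mllama("Llama-3.2-11B-Vision-Instruct", "Llama-3.2-11B-Vision-Instruct/OV")

    # Compress the exported language model, reusing the processor saved with the export.
    model_id = "Llama-3.2-11B-Vision-Instruct/OV"
    processor = AutoProcessor.from_pretrained(model_id)
    compress(model_id, processor)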