From 3e0b5bbe8c23c5c23d0a90b1be66a0881f742c21 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Tue, 24 Sep 2024 21:45:28 +0400
Subject: [PATCH] small refactoring (#2411)

Co-authored-by: Andrei Anufriev
---
 notebooks/mllama-3.2/data_preprocessing.py    |  29 ---
 notebooks/mllama-3.2/ov_mllama_compression.py |   6 -
 notebooks/mllama-3.2/ov_mllama_helper.py      | 175 +++++++++++++++++-
 3 files changed, 171 insertions(+), 39 deletions(-)

diff --git a/notebooks/mllama-3.2/data_preprocessing.py b/notebooks/mllama-3.2/data_preprocessing.py
index 370187976eb..61b8246edce 100644
--- a/notebooks/mllama-3.2/data_preprocessing.py
+++ b/notebooks/mllama-3.2/data_preprocessing.py
@@ -39,35 +39,6 @@ def get_pil_from_url(url):
     return image.convert("RGB")
 
 
-# def collate_fn_llm(example, image_column="image_url", text_column="caption"):
-#     """
-#     Preprocesses an example by loading and transforming image and text data.
-#     Checks if the text data in the example is valid by calling the `check_text_data` function.
-#     Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
-#     If there is any error during the download process, returns None.
-#     Returns the preprocessed inputs with transformed image and text data.
-#     """
-#     assert len(example) == 1
-#     example = example[0]
-
-#     if not check_text_data(example[text_column]):
-#         raise ValueError("Text data is not valid")
-
-#     url = example[image_column]
-#     try:
-#         image = get_pil_from_url(url)
-#         h, w = image.size
-#         if h == 1 or w == 1:
-#             return None
-#     except Exception:
-#         return None
-
-#     inputs = processor(text="<|image|><|begin_of_text|>"+example[text_column], images=image, return_tensors="pt", padding=True)
-#     if inputs['input_ids'].shape[1] > max_length:
-#         return None
-#     return inputs
-
-
 def prepare_calibration_data_vision(dataloader, init_steps):
     """
     This function prepares calibration data from a dataloader for a specified number of initialization steps.
diff --git a/notebooks/mllama-3.2/ov_mllama_compression.py b/notebooks/mllama-3.2/ov_mllama_compression.py
index b6b602f3a99..8beb9def891 100644
--- a/notebooks/mllama-3.2/ov_mllama_compression.py
+++ b/notebooks/mllama-3.2/ov_mllama_compression.py
@@ -133,9 +133,3 @@ def compress(
     print(f"Model compression finished. Compressed model can be found in {saving_path}")
     return saving_path
-
-
-# model_id = "Llama-3.2-11B-Vision-Instruct/OV"
-# processor = AutoProcessor.from_pretrained(model_id)
-
-# compress(model_id, processor)
diff --git a/notebooks/mllama-3.2/ov_mllama_helper.py b/notebooks/mllama-3.2/ov_mllama_helper.py
index f5ddcd86276..84360a1681d 100644
--- a/notebooks/mllama-3.2/ov_mllama_helper.py
+++ b/notebooks/mllama-3.2/ov_mllama_helper.py
@@ -4,7 +4,6 @@
 from transformers.models.llama.modeling_llama import repeat_kv
 from openvino.frontend.pytorch.patch_model import __make_16bit_traceable
 from typing import Optional, Union, List, Tuple, Dict
-from optimum.exporters.openvino.stateful import patch_stateful
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import ModelOutput
 import openvino.runtime.opset13 as ops
@@ -83,6 +82,176 @@ def callback(matcher: Matcher) -> bool:
 }
 
 
+def model_has_state(ov_model: ov.Model):
+    """Returns True if the model already has state (contains sink ops)."""
+    return len(ov_model.get_sinks()) > 0
+
+
+def model_has_input_output_name(ov_model: ov.Model, name: str):
+    """
+    Helper function for checking that the model has a specified input or output name.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      name (str):
+          name of input or output
+
+    Returns:
+      True if an input or output with the requested name exists, else False
+    """
+    return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])
+
+
+def fuse_cache_reorder(
+    ov_model: ov.Model,
+    not_kv_inputs: List[str],
+    key_value_input_names: List[str],
+    gather_dim: int,
+):
+    """
+    Fuses cache reordering during the generate cycle into the ov.Model. Used with stateful models,
+    because the model state cannot be modified directly.
+
+    Adds a new beam_idx parameter and a Gather op for each kv-cache input in the given model.
+    Should be run before make_stateful. Implements optimum's _reorder_cache
+    inside the model at the beginning of each iteration.
+    Gather works along the given gather_dim dimension, which may vary from model to model.
+    KV-cache inputs are identified based on the names in key_value_input_names.
+    Appends the new beam_idx parameter to not_kv_inputs.
+
+    Parameters:
+      ov_model (`ov.Model`):
+          openvino model for processing
+      not_kv_inputs (`List[str]`):
+          list of model inputs that are not related to past key values
+      key_value_input_names (`List[str]`):
+          list of names for key value input layers
+      gather_dim (int):
+          dimension for gathering cache during the reorder pass
+    """
+
+    if model_has_input_output_name(ov_model, "beam_idx"):
+        raise ValueError("Model already has fused cache")
+    input_batch = ov_model.input("input_ids").get_partial_shape()[0]
+    beam_idx = ops.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
+    beam_idx.output(0).get_tensor().add_names({"beam_idx"})  # add_names expects a set, not a list
+    ov_model.add_parameters([beam_idx])
+    not_kv_inputs.append(ov_model.inputs[-1])
+    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
+    for input_name in key_value_input_names:
+        parameter_output_port = ov_model.input(input_name)
+        consumers = parameter_output_port.get_target_inputs()
+        gather = ops.gather(parameter_output_port, beam_idx, ops.constant(gather_dim))
+        for consumer in consumers:
+            consumer.replace_source_output(gather.output(0))
+    ov_model.validate_nodes_and_infer_types()
+
+
+def build_state_initializer(ov_model: ov.Model, batch_dim: int):
+    """
+    Builds an initialization ShapeOf expression for all ReadValue ops.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      batch_dim (int):
+          index of dimension corresponding to batch size
+    """
+    input_ids = ov_model.input("input_ids")
+    batch = ops.gather(
+        ops.shape_of(input_ids, output_type="i64"),
+        ops.constant([0]),
+        ops.constant(0),
+    )
+    for op in ov_model.get_ops():
+        if op.get_type_name() == "ReadValue":
+            dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
+            dims[batch_dim] = batch
+            dims = [(ops.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
+            shape = ops.concat(dims, axis=0)
+            broadcast = ops.broadcast(ops.constant(0.0, dtype=op.get_output_element_type(0)), shape)
+            op.set_arguments([broadcast])
+    ov_model.validate_nodes_and_infer_types()
+
+
+def make_stateful(
+    ov_model: ov.Model,
+    not_kv_inputs: List[str],
+    key_value_input_names: List[str],
+    key_value_output_names: List[str],
+    batch_dim: int,
+    num_attention_heads: int,
+    num_beams_and_batch: Optional[int] = None,
+):
+    """
+    Hides kv-cache inputs and outputs inside the model as variables.
+
+    Parameters:
+      ov_model (ov.Model):
+          openvino model
+      not_kv_inputs (`List[str]`):
+          list of model inputs that are not related to past key values
+      key_value_input_names (`List[str]`):
+          list of names for key value input layers
+      key_value_output_names (`List[str]`):
+          list of names for key value output layers
+      batch_dim (int):
+          index of batch dimension in key value layers
+      num_attention_heads (int):
+          number of attention heads for batch dimension initialization
+      num_beams_and_batch (int):
+          precalculated number of beams and batch for shape initialization
+    """
+    from openvino._offline_transformations import apply_make_stateful_transformation
+
+    input_output_map = {}
+
+    if num_beams_and_batch is not None:
+        # Set batch size for input_ids and attention_mask to avoid a dynamic dimension being propagated from the end of the model back to ReadValue
+        for input in not_kv_inputs:
+            shape = input.get_partial_shape()
+            if shape.rank.get_length() <= 2:  # == 1 for beam_index
+                shape[0] = num_beams_and_batch
+                input.get_node().set_partial_shape(shape)
+    for kv_name_pair in zip(key_value_input_names, key_value_output_names):
+        input_output_map[kv_name_pair[0]] = kv_name_pair[1]
+        if num_beams_and_batch is not None:
+            input = ov_model.input(kv_name_pair[0])
+            shape = input.get_partial_shape()
+            shape[batch_dim] = num_beams_and_batch * num_attention_heads
+            input.get_node().set_partial_shape(shape)
+
+    if num_beams_and_batch is not None:
+        # Re-validate the model if shapes were altered above
+        ov_model.validate_nodes_and_infer_types()
+
+    apply_make_stateful_transformation(ov_model, input_output_map)
+    if num_beams_and_batch is None:
+        build_state_initializer(ov_model, batch_dim)
+
+
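+# patch_stateful below ties the helpers above together: it discovers the kv-cache inputs
+# and outputs by name (the "past_key_values" and "present" substrings), fuses the beam_idx
+# cache reorder, and then hides the kv-cache tensors inside the model as state variables.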
+def patch_stateful(ov_model):
+    key_value_input_names = [key_name for key in ov_model.inputs for key_name in key.get_names() if "past_key_values" in key_name]
+    key_value_output_names = [key_name for key in ov_model.outputs for key_name in key.get_names() if "present" in key_name]
+    not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())]
+    if not key_value_input_names or not key_value_output_names:
+        return
+    batch_dim = 0
+    num_attention_heads = 1
+
+    fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
+    make_stateful(
+        ov_model,
+        not_kv_inputs,
+        key_value_input_names,
+        key_value_output_names,
+        batch_dim,
+        num_attention_heads,
+        None,
+    )
+
+
 def convert_mllama(model_id, out_dir):
 
     out_dir = Path(out_dir)
@@ -306,8 +475,7 @@ def cross_attn_forward(
         output.get_tensor().set_names({output_name})
 
     ov_model.validate_nodes_and_infer_types()
-
-    patch_stateful(model.config.text_config, ov_model)
+    patch_stateful(ov_model)
 
     ov.save_model(ov_model, lang_model_path)
     del ov_model
     cleanup_torchscript_cache()
@@ -785,7 +953,6 @@ def prepare_remote_tensors(self):
 
 
 if __name__ == "__main__":
-    # convert_mllama("/home/ea/llama3.2/Llama-3.2-11B-Vision-Instruct", "Llama-3.2-11B-Vision-Instruct/OV")
     model_id = "Llama-3.2-11B-Vision-Instruct/OV"
     LANGUAGE_MODEL_NAME = "llm_int4_asym_r10_gs64_max_activation_variance_all_layers.xml"
     IMAGE_ENCODER_NAME = "openvino_vision_encoder.xml"
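
Usage sketch (illustrative): after this refactoring, patch_stateful no longer takes the text
config, and the notebook drives the scripts the same way as before. Assuming the original
checkpoint has been downloaded locally (both paths below are placeholders), conversion and
compression would look roughly like this:

    from transformers import AutoProcessor

    from ov_mllama_helper import convert_mllama
    from ov_mllama_compression import compress

    # Export the PyTorch checkpoint to OpenVINO IR.
    convert_mllama("Llama-3.2-11B-Vision-Instruct", "Llama-3.2-11B-Vision-Instruct/OV")

    # Compress the exported language model, reusing the processor saved with the export.
    model_id = "Llama-3.2-11B-Vision-Instruct/OV"
    processor = AutoProcessor.from_pretrained(model_id)
    compress(model_id, processor)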