From f533af0fe17c1fe9681a8c794bc02f2b794d03ea Mon Sep 17 00:00:00 2001
From: Alexander <kozzzloff@list.ru>
Date: Fri, 29 Nov 2024 16:22:06 +0400
Subject: [PATCH 1/2] Set FP16 KV-cache for non-quantized text models

---
 optimum/exporters/openvino/__main__.py |  2 ++
 optimum/exporters/openvino/convert.py  | 13 +++++++++----
 tests/openvino/test_export.py          |  1 +
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 3ac8314889..e4fe2a7a41 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -456,6 +456,8 @@ class StoreAttr(object):
         from optimum.intel.openvino.quantization import _weight_only_quantization
 
         _weight_only_quantization(submodel, quantization_config)
+        if "text-generation" in task:
+            submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])
 
         compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
         save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index e4ece9801b..fb6ab08e34 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -99,11 +99,15 @@ def _set_runtime_options(
     ],
     task: str,
     library_name: str,
+    quantized_model: bool
 ):
     for model_name in models_and_export_configs.keys():
         _, sub_export_config = models_and_export_configs[model_name]
+        sub_export_config.runtime_options = {}
         if "diffusers" in library_name or "text-generation" in task:
-            sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+            sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"
+        if not quantized_model and "text-generation" in task:
+            sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16"
 
 
 def _save_model(
@@ -116,8 +120,8 @@ def _save_model(
     compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
     model = _add_version_info_to_model(model, library_name)
 
-    if hasattr(config, "runtime_options"):
-        model = _add_runtime_options_to_rt_info(model, config.runtime_options)
+    runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {}
+    model = _add_runtime_options_to_rt_info(model, runtime_options)
     save_model(model, path, compress_to_fp16)
     del model
     gc.collect()
@@ -755,7 +759,8 @@ def export_from_model(
 
         model.save_config(output)
 
-    _set_runtime_options(models_and_export_configs, task, library_name)
+    _set_runtime_options(models_and_export_configs, task, library_name,
+                         hasattr(ov_config, "quantization_config") and ov_config.quantization_config)
 
     export_models(
         models_and_export_configs=models_and_export_configs,
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 80a45cab6e..2d57f92d0e 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -132,6 +132,7 @@ def _openvino_export(
                         ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version
                     )
                     self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]))
+                    self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))
 
                 if library_name == "diffusers":
                     self.assertTrue(

From 9c409ac70354c0c37039d59fbeaa30bf143fa396 Mon Sep 17 00:00:00 2001
From: Alexander <kozzzloff@list.ru>
Date: Fri, 29 Nov 2024 16:25:00 +0400
Subject: [PATCH 2/2] Style: reformat _set_runtime_options signature and call

---
 optimum/exporters/openvino/convert.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index fb6ab08e34..6012e6cfb5 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -99,7 +99,7 @@ def _set_runtime_options(
     ],
     task: str,
     library_name: str,
-    quantized_model: bool
+    quantized_model: bool,
 ):
     for model_name in models_and_export_configs.keys():
         _, sub_export_config = models_and_export_configs[model_name]
@@ -759,8 +759,12 @@ def export_from_model(
 
         model.save_config(output)
 
-    _set_runtime_options(models_and_export_configs, task, library_name,
-                         hasattr(ov_config, "quantization_config") and ov_config.quantization_config)
+    _set_runtime_options(
+        models_and_export_configs,
+        task,
+        library_name,
+        hasattr(ov_config, "quantization_config") and ov_config.quantization_config,
+    )
 
     export_models(
         models_and_export_configs=models_and_export_configs,