diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2432a7a24124..4f8977c66e9d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: name: check-torchdist entry: ./scripts/check-torchdist.py language: python - exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py) + exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py|deepspeed/runtime/zero/utils.py|deepspeed/tools/pg_sim/ut/base.py|deepspeed/tools/pg_sim/pg.py) # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm - repo: local diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 0c7ee3212fad..08c5be41e957 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -17,6 +17,18 @@ def __init__(self): def is_synchronized_device(self): ... + @abc.abstractmethod + def use_host_timers(self): + ... + + @abc.abstractmethod + def resolves_data_dependency(self): + ... + + @abc.abstractmethod + def handles_memory_backpressure(self): + ... + # Device APIs @abc.abstractmethod def device_name(self, device_index): @@ -255,6 +267,11 @@ def create_op_builder(self, class_name): def get_op_builder(self, class_name): ... + # creates and returns an optimizer, specified by optimizer_name, when the accelerator has its own implementation + @abc.abstractmethod + def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters): + ... + @abc.abstractmethod def build_extension(self): ... 
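For readers porting another backend: the hunk above adds three scheduling hooks (use_host_timers, resolves_data_dependency, handles_memory_backpressure) plus a get_optimizer hook to the abstract accelerator. Below is a minimal sketch, not part of this patch, of how a backend might satisfy them; the class name is hypothetical, and the method bodies simply mirror the CPU/CUDA/NPU implementations that appear later in this diff.

```python
class ExampleAccelerator:
    """Hypothetical backend; in practice this would subclass DeepSpeedAccelerator."""

    def is_synchronized_device(self):
        # True for eager, synchronous devices such as the CPU backend.
        return True

    # The three new hooks follow the device's synchronization behaviour,
    # which is exactly what the CPU, CUDA and NPU accelerators do in this patch.
    def use_host_timers(self):
        return self.is_synchronized_device()

    def resolves_data_dependency(self):
        return self.is_synchronized_device()

    def handles_memory_backpressure(self):
        return self.is_synchronized_device()

    def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters):
        # Returning None lets DeepSpeed fall back to its generic optimizer selection;
        # a backend with its own fused optimizers can return an instance here instead
        # (see the HPU implementation later in this diff).
        return None


acc = ExampleAccelerator()
assert acc.use_host_timers() and acc.handles_memory_backpressure()
```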
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index fdbbd33c07a2..60d1c597676f 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -21,6 +21,15 @@ def __init__(self): def is_synchronized_device(self): return True + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): return 'cpu' @@ -280,3 +289,6 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters): + return None diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 2786b425ca7f..ecb15b40c29f 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -42,6 +42,15 @@ def _init_pynvml(self): def is_synchronized_device(self): return False + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): if device_index == None: @@ -183,11 +192,7 @@ def is_bf16_supported(self): return torch.cuda.is_bf16_supported() def is_fp16_supported(self): - major, _ = torch.cuda.get_device_capability() - if major >= 7: - return True - else: - return False + return True def supported_dtypes(self): return [torch.float, torch.half, torch.bfloat16] @@ -322,3 +327,6 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters): + return None diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py new file mode 100644 index 000000000000..cc1798546352 --- /dev/null +++ b/accelerator/hpu_accelerator.py @@ -0,0 +1,312 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import pkgutil +import importlib +import torch + +from .abstract_accelerator import DeepSpeedAccelerator + + +class HPU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'hpu' + self._communication_backend_name = 'hccl' + try: + import habana_frameworks.torch.hpu as hpu + hpu.setDeterministic(True) + self.hpu = hpu + except ImportError as e: + raise ValueError( + f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + + self.fp16_supported = None + + # TODO SW-163871: remove the below WA once SW-154947 is resolved, solves OOM. 
+ zero_mark_step_req_env_var = os.getenv("DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED", "0") + self.zero3_synchronized_mark_step_required = zero_mark_step_req_env_var.lower() in ["1", "true"] + + # Device APIs + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return False + + def resolves_data_dependency(self): + return True + + def handles_memory_backpressure(self): + return True + + def device_name(self, device_index=None): + if device_index is None: + return 'hpu' + return 'hpu:{}'.format(device_index) + + def device(self, device_index=None): + return torch.device(self.device_name(device_index)) + + def set_device(self, device_index): + self.hpu.set_device(device_index) + + def current_device(self): + return (self.hpu.current_device()) + + def current_device_name(self): + return 'hpu:{}'.format(self.current_device()) + + def device_count(self): + return self.hpu.device_count() + + def synchronize(self, device_index=None): + return self.hpu.synchronize() + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + self.hpu.random.set_rng_state(new_state) + + def get_rng_state(self, device_index=None): + return self.hpu.random.get_rng_state() + + def manual_seed(self, seed): + self.hpu.random.manual_seed(seed) + + def manual_seed_all(self, seed): + self.hpu.random.manual_seed_all(seed) + + def initial_seed(self, seed): + self.hpu.random.initial_seed(seed) + + def default_generator(self, device_index): + return self.hpu.random.default_generators[ + device_index] # section that is supposed to use this is currently hpu only -refactor + + # Streams/Events + @property + def Stream(self): + return self.hpu.Stream + + def stream(self, stream): + return self.hpu.stream(stream) + + def current_stream(self, device_index=None): + return self.hpu.current_stream() + + def default_stream(self, device_index=None): + return self.hpu.default_stream() + + @property + def Event(self): + import habana_frameworks.torch.core as htcore + return htcore.hpu.Event # need correct implementation test only + + # Memory management + def empty_cache(self): + return + + def memory_allocated(self, device_index=None): + return self.hpu.memory_allocated() + + def max_memory_allocated(self, device_index=None): + return self.hpu.max_memory_allocated() + + def reset_max_memory_allocated(self, device_index=None): + return self.hpu.reset_max_memory_allocated() + + def memory_cached(self, device_index=None): + return self.hpu.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return self.hpu.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return None + + def memory_stats(self, device_index=None): + return self.hpu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + self.hpu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + return self.hpu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + return self.hpu.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return self.memory_stats(device_index)['Limit'] + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return True + + def is_fp16_supported(self): + if self.fp16_supported is None: + import habana_frameworks.torch.utils.experimental as htexp + 
self.fp16_supported = htexp._is_fp16_supported() + return self.fp16_supported + + def supported_dtypes(self): + supported_dtypes = [torch.float, torch.bfloat16] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + return supported_dtypes + + # Misc + def amp(self): + return None # not supported - doesn't seem to be used yet + + def is_available(self): + return self.hpu.is_available() + + def range_push(self, msg): + return + + def range_pop(self): + return + + def lazy_call(self, callback): + callback( + ) # currently only used in one cuda specific section of the code. re-evaluate implementation if this changes - add ticket. + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Tensor operations + @property + def BFloat16Tensor(self): + return self.hpu.BFloat16Tensor + + @property + def ByteTensor(self): + return self.hpu.ByteTensor + + @property + def DoubleTensor(self): + return self.hpu.DoubleTensor + + @property + def FloatTensor(self): + return self.hpu.FloatTensor + + @property + def HalfTensor(self): + return self.hpu.HalfTensor + + @property + def IntTensor(self): + return self.hpu.IntTensor + + @property + def LongTensor(self): + return self.hpu.LongTensor + + def pin_memory(self, tensor): + return tensor.pin_memory(self.device()) + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('hpu:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.hpu" + except ImportError: + return "deepspeed.ops.op_builder.hpu" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backends (cpu, npu, etc.).
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CPUOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + # TODO SW-163871: remove the below WA once SW-154947 is resolved, solves OOM. + def is_zero3_sync_mark_step_req(self): + return self.zero3_synchronized_mark_step_required + + def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters): + from habana_frameworks.torch.hpex.optimizers import FusedAdamW + + self._optimizers_dict = { + 'adamw': lambda arg1, **arg2: FusedAdamW(arg1, **arg2), + 'adam': lambda arg1, **arg2: torch.optim.Adam(arg1, **arg2) + } + from deepspeed.runtime.config import ADAM_OPTIMIZER, ADAMW_OPTIMIZER, TORCH_ADAM_PARAM, ADAM_W_MODE, \ + ADAM_W_MODE_DEFAULT + + if optimizer_name in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]: + if cpu_optimization or optimizer_parameters.pop(TORCH_ADAM_PARAM, False): + return None + if optimizer_name == ADAM_OPTIMIZER and optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT): + optimizer_name = ADAMW_OPTIMIZER + + if optimizer_name in self._optimizers_dict: + return self._optimizers_dict[optimizer_name](model_parameters, **optimizer_parameters) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 77595f6b636c..292ba45ae4a5 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -235,3 +235,6 @@ def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def get_optimizer(self, optimizer_name, cpu_optimization, model_parameters, **optimizer_parameters): + return None diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 7ebbd320bb15..03d62799c141 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -28,6 +28,15 @@ def __init__(self): def is_synchronized_device(self): return False + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): if device_index == None: @@ -255,3 +264,6 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def get_optimizer(self, optimizer_name, cpu_optimization, 
model_parameters, **optimizer_parameters): + return None diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 49133489b051..5b8951a00edd 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -20,7 +20,7 @@ except ImportError as e: dsa2 = None -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'npu', 'mps'] +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'npu', 'mps', 'hpu'] ds_accelerator = None @@ -45,7 +45,7 @@ def _validate_accelerator(accel_obj): def is_current_accelerator_supported(): - return get_accelerator() in SUPPORTED_ACCELERATOR_LIST + return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST def get_accelerator(): @@ -84,6 +84,12 @@ def get_accelerator(): torch.mps.current_allocated_memory() except (RuntimeError, ImportError) as e: raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.") + elif accelerator_name == "hpu": + try: + import habana_frameworks.torch.hpu # noqa: F401 + except ImportError as e: + raise ValueError( + f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") elif is_current_accelerator_supported(): raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' f'Value "{accelerator_name}" is not supported') @@ -131,6 +137,13 @@ def get_accelerator(): accelerator_name = "mps" except (RuntimeError, ImportError) as e: pass + if accelerator_name is None: + try: + import habana_frameworks.torch.hpu # noqa: F401,F811 + + accelerator_name = "hpu" + except ImportError as e: + pass if accelerator_name is None: accelerator_name = "cuda" @@ -156,6 +169,10 @@ def get_accelerator(): from .mps_accelerator import MPS_Accelerator ds_accelerator = MPS_Accelerator() + elif accelerator_name == 'hpu': + from .hpu_accelerator import HPU_Accelerator + + ds_accelerator = HPU_Accelerator() _validate_accelerator(ds_accelerator) if accel_logger is not None: accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") diff --git a/build.txt b/build.txt new file mode 100644 index 000000000000..40f8d43cce71 --- /dev/null +++ b/build.txt @@ -0,0 +1 @@ ++hpu.synapse.v1.15.1 diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp index 9a4a8d956519..c87eeea300f6 100644 --- a/csrc/adam/cpu_adam_impl.cpp +++ b/csrc/adam/cpu_adam_impl.cpp @@ -244,13 +244,17 @@ int ds_adam_step(int optimizer_id, opt->IncrementStep(step, beta1, beta2); opt->update_state(lr, epsilon, weight_decay, bias_correction); + bool bit16_precision = false; + if ((params.options().dtype() == at::kHalf) || (params.options().dtype() == at::kBFloat16)) + bit16_precision = true; + opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.numel(), nullptr, - (params.options().dtype() == at::kHalf)); + bit16_precision); #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) opt->SynchronizeStreams(); diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 44d3ed3cac61..1b3a6104708d 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -23,6 +23,9 @@ typedef __half ds_half_precision_t; #include "acl/acl.h" #include "torch_npu/csrc/core/npu/NPUStream.h" typedef c10::Half ds_half_precision_t; +#elif defined(__BFLOAT16__) +#include +typedef at::BFloat16 ds_half_precision_t #else #include typedef unsigned short ds_half_precision_t; @@ -260,6 +263,7 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, simd_store(_exp_avg + i, momentum_4, false); 
simd_store(_exp_avg_sq + i, variance_4, false); } +// Params are updated only in case of float16, which is currently not supported on HPU #if defined(__ENABLE_CUDA__) if (dev_params) { if (half_precision) diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index f77568be7835..936225797642 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -9,6 +9,23 @@ #include #include #endif +#include +#include +#include + +template +inline T readAs(const void* src) +{ + T res; + std::memcpy(&res, src, sizeof(T)); + return res; +} + +template +inline void writeAs(void* dst, const T& val) +{ + std::memcpy(dst, &val, sizeof(T)); +} #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) @@ -29,12 +46,58 @@ #define SIMD_OR(x, y) _mm512_or_ps(x, y) #define SIMD_XOR(x, y) _mm512_xor_ps(x, y) #define SIMD_WIDTH 16 +static __m512 load_16_bf16_as_f32(const void* data) +{ + __m256i a = readAs<__m256i>(data); // use memcpy to avoid aliasing + __m512i b = _mm512_cvtepu16_epi32(a); // convert 8 u16 to 8 u32 + __m512i c = _mm512_slli_epi32(b, 16); // logical shift left of all u32 by + // 16 bits (representing bf16->f32) + return readAs<__m512>(&c); // use memcpy to avoid aliasing +} + +static void store_16_f32_as_bf16_nearest(__m512 v, void* data) +{ + __m512i u32 = readAs<__m512i>(&v); + + // flow assuming non-nan: + + // uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + __m512i b = _mm512_srli_epi32(u32, 16); + __m512i lsb_mask = _mm512_set1_epi32(0x00000001); + __m512i c = _mm512_and_si512(b, lsb_mask); + __m512i bias_constant = _mm512_set1_epi32(0x00007fff); + __m512i rounding_bias = _mm512_add_epi32(c, bias_constant); + + // uint16_t res = static_cast((U32 + rounding_bias) >> 16); + __m512i d = _mm512_add_epi32(u32, rounding_bias); + __m512i e = _mm512_srli_epi32(d, 16); + __m256i non_nan_res = _mm512_cvtusepi32_epi16(e); + + // handle nan (exp is all 1s and mantissa != 0) + // if ((x & 0x7fffffffU) > 0x7f800000U) + __m512i mask_out_sign = _mm512_set1_epi32(0x7fffffff); + __m512i non_sign_bits = _mm512_and_si512(u32, mask_out_sign); + __m512i nan_threshold = _mm512_set1_epi32(0x7f800000); + __mmask16 nan_mask = _mm512_cmp_epi32_mask(non_sign_bits, nan_threshold, _MM_CMPINT_GT); + + // mix in results with nans as needed + __m256i nans = _mm256_set1_epi16(0x7fc0); + __m256i res = _mm256_mask_mov_epi16(non_nan_res, nan_mask, nans); + + writeAs(data, res); +} +#if defined(ENABLE_BFLOAT16) +#define SIMD_LOAD2(x, h) ((h) ? load_16_bf16_as_f32(x) : _mm512_loadu_ps(x)) + +#define SIMD_STORE2(x, d, h) ((h) ? store_16_f32_as_bf16_nearest(d, x) : _mm512_storeu_ps(x, d)) +#else // ENABLE_BFLOAT16 #define SIMD_LOAD2(x, h) \ ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) #define SIMD_STORE2(x, d, h) \ ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ : _mm512_storeu_ps(x, d)) +#endif // ENABLE_BFLOAT16 #define INTV __m256i #elif defined(__AVX256__) @@ -54,10 +117,58 @@ #define SIMD_LOAD2(x, h) \ ((h) ? 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) +__m256 load_8_bf16_as_f32(const float* data) +{ + __m128i a = readAs<__m128i>(data); // use memcpy to avoid aliasing + __m256i b = _mm256_cvtepu16_epi32(a); // convert 8 u16 to 8 u32 + __m256i c = _mm256_slli_epi32(b, 16); // logical shift left of all u32 by + // 16 bits (representing bf16->f32) + return readAs<__m256>(&c); // use memcpy to avoid aliasing +} + +void store_8_f32_as_bf16_nearest(__m256 v, float* data) +{ + __m256i u32 = readAs<__m256i>(&v); + + // flow assuming non-nan: + + // uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + __m256i b = _mm256_srli_epi32(u32, 16); + __m256i lsb_mask = _mm256_set1_epi32(0x00000001); + __m256i c = _mm256_and_si256(b, lsb_mask); + __m256i bias_constant = _mm256_set1_epi32(0x00007fff); + __m256i rounding_bias = _mm256_add_epi32(c, bias_constant); + + // uint16_t res = static_cast((U32 + rounding_bias) >> 16); + __m256i d = _mm256_add_epi32(u32, rounding_bias); + __m256i e = _mm256_srli_epi32(d, 16); + __m128i non_nan_res = _mm256_cvtusepi32_epi16(e); + + // handle nan (exp is all 1s and mantissa != 0) + // if ((x & 0x7fffffffU) > 0x7f800000U) + __m256i mask_out_sign = _mm256_set1_epi32(0x7fffffff); + __m256i non_sign_bits = _mm256_and_si256(u32, mask_out_sign); + __m256i nan_threshold = _mm256_set1_epi32(0x7f800000); + __mmask8 nan_mask = _mm256_cmp_epi32_mask(non_sign_bits, nan_threshold, _MM_CMPINT_GT); + + // mix in results with nans as needed + __m128i nans = _mm_set1_epi16(0x7fc0); + __m128i res = _mm_mask_mov_epi16(non_nan_res, nan_mask, nans); + + writeAs(data, res); +} +#if defined(ENABLE_BFLOAT16) +#define SIMD_LOAD2(x, h) ((h) ? load_8_bf16_as_f32(x) : _mm256_loadu_ps(x)) + +#define SIMD_STORE2(x, d, h) ((h) ? store_8_f32_as_bf16_nearest(d, x) : _mm256_storeu_ps(x, d)) +#else // ENABLE_BFLOAT16 +#define SIMD_LOAD2(x, h) \ + ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)(x))) : _mm256_loadu_ps(x)) + #define SIMD_STORE2(x, d, h) \ ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ : _mm256_storeu_ps(x, d)) - +#endif // ENABLE_BFLOAT16 #define INTV __m128i #endif @@ -80,7 +191,7 @@ inline void simd_store(float* dst, AVX_Data* src, bool half_precision) template inline void simd_load(AVX_Data* dst, float* src, bool half_precision) { - size_t width = (half_precision ? 1 : SIMD_WIDTH); + size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); #pragma unroll for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } } diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index b7277d1e1678..254785b440b9 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -446,15 +446,15 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, unsigned layer_id, unsigned num_layers, at::Tensor& alibi, - float rope_theta) + float rope_theta, + bool is_prompt, + std::optional token_idx) { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); int k = query_key_value.size(2) / (heads + 2 * (num_kv > 0 ? 
num_kv : heads)); unsigned hidden_dim = heads * k; - bool is_prompt = (seq_len > 1); - if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); unsigned soft_len = InferenceContext::Instance().current_tokens(); @@ -847,6 +847,87 @@ std::vector ds_layer_norm_residual_store_pre_ln_res(at::Tensor& inpu return {norm_output, res_output}; } +template +at::Tensor ds_transform4d_0213(at::Tensor& input, int seq_length) +{ + auto input_cont = input.contiguous(); + unsigned batch_size = input.size(0); + unsigned num_heads = input.size(1); + unsigned seq_length_head_dim = input.size(2); + unsigned head_dim = seq_length_head_dim / seq_length; + unsigned hidden_dim = num_heads * head_dim; + + auto options = at::TensorOptions() + .dtype(input.options().dtype()) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); + + launch_transform4d_0213(workspace, + (T*)input.data_ptr(), + batch_size, + num_heads, + seq_length, + hidden_dim, + InferenceContext::Instance().GetCurrentStream(), + 1); + auto output = at::from_blob(workspace, {batch_size, seq_length, num_heads, head_dim}, options); + return output; +} + +template +std::vector ds_bias_add_transform_0213(at::Tensor& input, + at::Tensor& bias, + int num_heads, + int trans_count) +{ + TORCH_CHECK( + trans_count == 1 or trans_count == 3, "trans_count ", trans_count, " is not supported"); + auto input_cont = input.contiguous(); + + unsigned batch_size = input.size(0); + unsigned seq_length = input.size(1); + unsigned value_size = input.size(2); + unsigned hidden_dim = input.size(2) / trans_count; + unsigned head_dim = hidden_dim / num_heads; + + auto options = at::TensorOptions() + .dtype(input.options().dtype()) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); + auto final_output = workspace; + int num_kv = -1; + int repo_theta = -1; + size_t offset = (batch_size * seq_length * hidden_dim); + launch_bias_add_transform_0213(final_output, + final_output + offset, + final_output + 2 * offset, + (T*)input.data_ptr(), + (T*)bias.data_ptr(), + batch_size, + seq_length, + 0, // seq_offset + input.size(1), // all_tokens .. unused? 
+ hidden_dim, + num_heads, + num_kv, + -1, // rotary_dim + false, // rotate_half + false, // rotate_every_two + InferenceContext::Instance().GetCurrentStream(), + trans_count, // trans_count + input.size(1), // max_out_tokens + repo_theta); + return {at::from_blob(final_output, {batch_size, num_heads, seq_length, head_dim}, options), + at::from_blob( + final_output + offset, {batch_size, num_heads, seq_length, head_dim}, options), + at::from_blob( + final_output + 2 * offset, {batch_size, num_heads, seq_length, head_dim}, options)}; +} + template void quantized_gemm(void* output, T* input, @@ -2010,7 +2091,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \ m.def("dequantize_" #_name, \ &ds_dequantize<_dtype>, \ - "DeepSpeed dequantize with " #_name " (CUDA)") + "DeepSpeed dequantize with " #_name " (CUDA)"); \ + m.def("transform4d_0213_" #_name, \ + &ds_transform4d_0213<_dtype>, \ + "DeepSpeed transform4d 0213 with " #_name " (CUDA)"); \ + m.def("bias_add_transform_0213_" #_name, \ + &ds_bias_add_transform_0213<_dtype>, \ + "DeepSpeed bias and transform 0213 with " #_name " (CUDA)") DEF_OPS(fp32, float); DEF_OPS(fp16, __half); diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 87b959f56be0..95ee58b0421a 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -18,6 +18,7 @@ except ImportError: HAS_TRITON = False +from .utils import log_dist, OnDevice, logger from . import ops from . import module_inject @@ -34,11 +35,11 @@ from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .module_inject import replace_transformer_layer, revert_transformer_layer -from .utils import log_dist, OnDevice, logger from .comm.comm import init_distributed from .runtime import zero from .runtime import DeepSpeedOptimizer, ZeROOptimizer +from .runtime.compiler import is_compile_supported from .pipe import PipelineModule diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index f809a0c39270..c57739d1d1a0 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -58,6 +58,8 @@ # Vocabulary padding VOCAB_TENSOR = 'vocab_tensor' +# TODO: remove once transition to new universal checkpoint is completed +VOCAB_DIVISIBILITY_PADDING_TENSOR = 'vocab_divisibility_padding_tensor' PADDED_VOCAB_SIZE = 'padded_vocab_size' ORIGINAL_VOCAB_SIZE = 'original_vocab_size' diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index 77634222d292..f3fd16057497 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import re from typing import Dict import torch @@ -32,7 +33,13 @@ class DeepSpeedCheckpoint(object): - def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): + def __init__(self, + dir, + tp_degree=None, + pp_degree=None, + dp_degree=None, + final_layer_norm_idx=FINAL_LAYER_NORM_INDEX): + self.final_layer_norm_idx = final_layer_norm_idx self.dir = dir self._validate_folder(dir) @@ -70,7 +77,7 @@ def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): self.pp_to_transformer_map = self._build_pp_transformer_map() self.transformer_file_map = self._build_transformer_file_map() self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) + 
self.tp_to_final_norm_map = self._build_tp_other_layer_map(self.final_layer_norm_idx) self._build_global_state() def is_change_tp_degree(self): @@ -122,7 +129,7 @@ def get_embedding_layer_id(self): return self.layer_keys[EMBEDDING_LAYER_INDEX] def get_final_norm_layer_id(self): - return self.layer_keys[FINAL_LAYER_NORM_INDEX] + return self.layer_keys[self.final_layer_norm_idx] def get_iteration(self): if not ITERATION_KEY in self.global_state: @@ -207,7 +214,7 @@ def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list: def _build_pp_transformer_map(self): data_map = {} - transformer_layers = self.layer_keys[1:-1] + transformer_layers = self.layer_keys[1:self.final_layer_norm_idx] layers_per_pp = len(transformer_layers) // self.pp_degree data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)} return data_map @@ -219,7 +226,7 @@ def _dump_mapping(self, data_map, map_tag=None): print(f'{k} = {v}') def _build_transformer_file_map(self): - transformer_layer_keys = self.layer_keys[1:-1] + transformer_layer_keys = self.layer_keys[1:self.final_layer_norm_idx] file_map = {} # XXX: this is not guaranteed layers_per_pp = len(transformer_layer_keys) // self.pp_degree @@ -228,7 +235,7 @@ def _build_transformer_file_map(self): #print(f"{transformer_layer_keys} {layers_per_pp}") for key_index, layer_key in enumerate(transformer_layer_keys): pp_index = key_index // layers_per_pp - layer_files = get_files_with_prefix(self.layer_files, layer_key) + layer_files = get_files_with_prefix(self.layer_files, layer_key + '-') layer_file_partitions = partition_data(layer_files, self.tp_degree) for tp_index in range(self.tp_degree): map_key = (tp_index, pp_index) @@ -253,11 +260,13 @@ def validate_files(self): def _get_layer_keys(self): key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 for file_path in self.layer_files: _, fname = os.path.split(file_path) - key_set.add(fname[:key_len]) - return sorted(list(key_set)) + layer_id = re.search(r'layer_(\d+)-model_.*', fname).group(1) + key_set.add(layer_id) + sorted_ids = sorted(list(key_set), key=int) + layer_keys = [LAYER_FILE_PREFIX + str(layer_id) for layer_id in sorted_ids] + return layer_keys def _merge_state_dicts(self, sd_list): merged_sd = {} diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 5849a834cdd3..408a14f6b287 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -6,7 +6,9 @@ import os import torch import types -from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS) +# todo: remove VOCAB_DIVISIBILITY_PADDING_TENSOR once transition to universal checkpointing is completed +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, VOCAB_DIVISIBILITY_PADDING_TENSOR, CAT_DIM, + PARAM_N_SUB_PARAMS) def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): @@ -54,6 +56,23 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): padding_size = padded_target_vocab_size - full_hp_param.shape[0] full_hp_param = torch.nn.functional.pad(full_hp_param, (0, 0, 0, padding_size), "constant", 0) + # TODO: For BWD compatibility with old universal checkpointing mechanism, remove once transition is completed + vocab_divisibility_padding_tensor = ckpt_dict.get(VOCAB_DIVISIBILITY_PADDING_TENSOR, None) + if vocab_divisibility_padding_tensor is not None: + # In the absence of data passed from the user wrt new padded vocab 
specific to tp degree + # we can again derive that data by reverse engineering the target shapes like so: + padded_target_vocab_size = self.shape[0] * tp_world_size + if padded_target_vocab_size > full_hp_param.shape[0]: + # Need to expand + pad_size = padded_target_vocab_size - full_hp_param.shape[0] + hidden_size = vocab_divisibility_padding_tensor.shape[-1] + padding_tensor = vocab_divisibility_padding_tensor.view(1, -1).expand(pad_size, hidden_size) + full_hp_param = torch.nn.functional.pad(full_hp_param, (0, 0, 0, pad_size), "constant", 0) + full_hp_param[-pad_size:, :] = padding_tensor + else: + # Need to shrink or keep the same + full_hp_param = full_hp_param[:padded_target_vocab_size, :] + full_param_numel = full_hp_param.numel() tp_slice_numel = self.numel() # if key == FP32_WEIGHT_KEY and 'word_embeddings.weight' in folder: diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index c65745d3dd0c..e89ad9f1b2e0 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -105,7 +105,7 @@ def _strip_tensor_paddings(self, sd): if group_paddings[key] == 0: continue for state_name, state_value in group_state.items(): - if torch.is_tensor(state_value): + if torch.is_tensor(state_value) and state_value.dim(): raw_length = state_value.numel() - group_paddings[key] group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone() diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 568211645f40..fab52f4600dd 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -595,7 +595,7 @@ def get_all_ranks_from_group(group=None): while True: group_ranks.append(cdb.get_global_rank(group, rank)) rank += 1 - except RuntimeError: + except (RuntimeError, ValueError): pass return group_ranks @@ -614,7 +614,7 @@ def init_distributed(dist_backend=None, ''' Initialize dist backend, potentially performing MPI discovery if needed Arguments: - dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo + dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo, hccl auto_mpi_discovery Optional (bool). if distributed environment variables are not set, attempt to discover them from MPI distributed_port: Optional (int). torch distributed backend port verbose: Optional (bool). 
verbose logging diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 531f54d2628f..5f51165f6075 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -8,6 +8,7 @@ from .utils import * from .backend import * from .comm import * +from ..runtime import compiler import os DS_COMM_ALL_GATHER_OFF = False @@ -120,6 +121,7 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name=' self.init_process_group(backend, timeout, init_method, rank, world_size) @classmethod + @compiler.disable def get_all_gather_function(self): if hasattr(torch.distributed, "all_gather_into_tensor"): return torch.distributed.all_gather_into_tensor @@ -128,6 +130,7 @@ def get_all_gather_function(self): return None @classmethod + @compiler.disable def get_reduce_scatter_function(self): if hasattr(torch.distributed, "reduce_scatter_tensor"): return torch.distributed.reduce_scatter_tensor @@ -143,21 +146,20 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): - torch.distributed.init_process_group(backend, - timeout=timeout, - init_method=init_method, - rank=rank, - world_size=world_size) + torch.distributed.init_process_group(backend, timeout=timeout, init_method=init_method) self.using_mpi = torch.distributed.get_backend() == 'mpi' + @compiler.disable def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): op = self._reduce_op(op) return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + @compiler.disable def inference_all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): op = self._reduce_op(op) return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + @compiler.disable def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): """ proxy func to torch.distributed.all_reduce_coalesced, which is included in PyTorch 1.13 and above @@ -168,6 +170,7 @@ def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group op = self._reduce_op(op) return torch.distributed.all_reduce_coalesced(tensors=tensors, op=op, group=group, async_op=async_op) + @compiler.disable def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): if DS_COMM_REDUCE_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -175,6 +178,7 @@ def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): return Noop() return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op) + @compiler.disable def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False): if DS_COMM_REDUCE_SCATTER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -187,6 +191,7 @@ def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_ group=group, async_op=async_op) + @compiler.disable def broadcast(self, tensor, src, group=None, async_op=False): if DS_COMM_BROADCAST_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -195,6 +200,7 @@ def broadcast(self, tensor, src, group=None, async_op=False): else: return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) + @compiler.disable def all_gather(self, tensor_list, tensor, group=None, async_op=False): if DS_COMM_ALL_GATHER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -203,6 +209,7 @@ def all_gather(self, tensor_list, tensor, group=None, 
async_op=False): else: return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) + @compiler.disable def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_op=False): if self.has_all_gather_into_tensor(): return self.all_gather_function(output_tensor=output_tensor, @@ -210,6 +217,7 @@ def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_ group=group, async_op=async_op) + @compiler.disable def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False): if DS_COMM_ALL_GATHER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -227,6 +235,7 @@ def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=Fals "please consider upgrading your pytorch installation.") pass + @compiler.disable def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_op=False): """""" assert len(output_tensors) == len(input_tensors), "" @@ -250,6 +259,7 @@ def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_ else: reqs[-1].wait() + @compiler.disable def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False): if self.has_reduce_scatter_tensor(): return self.reduce_scatter_function(output_tensor, @@ -263,6 +273,7 @@ def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, gr "please consider upgrading your pytorch installation.") pass + @compiler.disable def all_to_all_single(self, output, input, @@ -277,21 +288,27 @@ def all_to_all_single(self, group=group, async_op=async_op) + @compiler.disable def all_to_all(self, output_tensor_list, input_tensor_list, group=None, async_op=False): return torch.distributed.all_to_all(output_tensor_list, input_tensor_list, group=group, async_op=async_op) + @compiler.disable def send(self, tensor, dst, group=None, tag=0): return torch.distributed.send(tensor=tensor, dst=dst, group=group, tag=tag) + @compiler.disable def recv(self, tensor, src=None, group=None, tag=0): return torch.distributed.recv(tensor=tensor, src=src, group=group, tag=tag) + @compiler.disable def isend(self, tensor, dst, group=None, tag=0): return torch.distributed.isend(tensor=tensor, dst=dst, group=group, tag=tag) + @compiler.disable def irecv(self, tensor, src=None, group=None, tag=0): return torch.distributed.irecv(tensor=tensor, src=src, group=group, tag=tag) + @compiler.disable def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False): return torch.distributed.gather(tensor=tensor, gather_list=gather_list, @@ -299,6 +316,7 @@ def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False): group=group, async_op=async_op) + @compiler.disable def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False): return torch.distributed.scatter(tensor=tensor, scatter_list=scatter_list, @@ -306,11 +324,13 @@ def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False): group=group, async_op=async_op) + @compiler.disable def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, device_ids=None): if group is None: group = torch.distributed.GroupMember.WORLD return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids) + @compiler.disable def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False): if group is None: group = torch.distributed.GroupMember.WORLD diff --git a/deepspeed/inference/engine.py 
b/deepspeed/inference/engine.py index 149d20cd9305..00fb04c64d1f 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -71,6 +71,9 @@ def __init__(self, model, config): if hasattr(self.module, "config"): TransformerPolicy.hf_model_config = self.module.config + if config.dtype == torch.half and not get_accelerator().is_fp16_supported(): + raise ValueError("Type fp16 is not supported.") + # todo: keep this self.injection_dict because we don't use to change config.injection_policy API # todo: this will get changed when Molly's PR on auto injection dict is merged self.injection_dict = config.injection_policy @@ -109,11 +112,6 @@ def __init__(self, model, config): assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \ "If you want to use cuda graph, please upgrade torch to at least v1.10" - # Check if model passed to engine is loaded w/ meta tensors, in which case - # kernel injection must be enabled. - # NOTE: This check assumes a Hugging Face hierarchy for the device type i.e. module.device.type - self.model_meta_device = self.module.device.type == 'meta' if hasattr(self.module, "device") else False - # convert model to intended dtype if config.dtype: self._convert_to_dtype(config) @@ -170,14 +168,22 @@ def __init__(self, model, config): self._apply_injection_policy(config, client_module) device = get_accelerator().current_device_name() - self.module.to(device) + # NOTE: This check assumes a Hugging Face hierarchy for the device type i.e. module.device.type + is_meta_device = hasattr(self.module, "device") and self.module.device.type == 'meta' + if is_meta_device: + self.module.to_empty(device=device) + else: + self.module.to(device) if config.tensor_parallel.tp_size > 1: _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) dist.broadcast(_rng_state, 0) get_accelerator().set_rng_state(_rng_state.cpu()) - if config.tensor_parallel.tp_size > 1: + if config.enable_cuda_graph and get_accelerator().device_name() == 'hpu': + import habana_frameworks.torch as ht + self.module = ht.hpu.wrap_in_hpu_graph(self.module) + elif config.tensor_parallel.tp_size > 1: assert not config.enable_cuda_graph, "Cuda graph is not supported for model parallelism" # Check if local CUDA graphs can be created in replacement modules @@ -314,7 +320,7 @@ def _validate_args(self, mpu, replace_with_kernel_inject): if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") - supported_dtypes = [None, torch.half, torch.int8, torch.float] + supported_dtypes = [None, torch.half, torch.int8, torch.float, torch.bfloat16] if self._config.dtype not in supported_dtypes: raise ValueError(f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") @@ -582,7 +588,8 @@ def forward(self, *inputs, **kwargs): **kwargs: variable length keyword arguments """ start = None - if self.model_profile_enabled and get_accelerator().device_name() == 'cuda' and self._config.enable_cuda_graph: + if self.model_profile_enabled and (get_accelerator().device_name() == 'cuda' or get_accelerator().device_name() == 'hpu') and \ + self._config.enable_cuda_graph: get_accelerator().synchronize() start = time.time() diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index a7fa2b5053e5..f8c31e3c2a42 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -32,7 +32,10 @@ from 
deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" -EXPORT_ENVS = ['MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX'] +EXPORT_ENVS = [ + 'MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX', 'PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES', 'PT_HPU_LAZY_ACC_PAR_MODE', + 'PT_HPU_ENABLE_WEIGHT_CPU_PERMUTE' +] EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] @@ -483,6 +486,18 @@ def main(args=None): if args.elastic_training: assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training" + if get_accelerator().device_name() == 'hpu': + # TODO: SW-113485 need to remove the below WA once SW-113485 is unblocked + def update_wa_env_var(key, value): + if key not in os.environ.keys(): + env[key] = value + + update_wa_env_var("PT_HPU_LAZY_ACC_PAR_MODE", "0") + # todo SW-125782: remove DYNAMIC SHAPE disable WA + update_wa_env_var("PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES", "0") + # todo SW-145489: remove WEIGHT CPU PERMUTE WA after SW-145491 is resolved + update_wa_env_var("PT_HPU_ENABLE_WEIGHT_CPU_PERMUTE", "0") + # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index a41df58ad059..0830325c30b3 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -87,7 +87,6 @@ def __init__(self, requires_grad=False) self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) - self.layer_past = None try: if config.dtype == torch.float32: self.allocate_workspace = inference_module.allocate_workspace_fp32 @@ -149,10 +148,7 @@ def forward( get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask - # We set the prev key/value to None when there is a prompt - if input.shape[1] > 1: - self.layer_past = None - layer_past = layer_past if layer_past is not None else self.layer_past + layer_past = past_key_value if past_key_value is not None else layer_past head_mask = layer_head_mask if layer_head_mask is not None else head_mask attn_mask = None @@ -178,10 +174,10 @@ def forward( output_attentions, self.norm_w, self.norm_b, - alibi) + alibi, + **kwargs) presents = (key, value) - self.layer_past = presents if layer_past is None else None output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index c235cc766209..21703458da7a 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -17,6 +17,16 @@ from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list +def move(tensor, device): + if tensor.is_meta: + return torch.empty_like(tensor, device=device) + else: + # Using new tensors help in freeing memory (after split for example) was done before by calling clone(). + # Using copy=True instead of clone() will help in case of cpu --> cpu. + # Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced. 
+ return tensor.to(device, copy=True) + + class ReplaceWithTensorSlicing: def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0): @@ -121,7 +131,9 @@ class Loading(): def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] - load_layer_names = ["LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm"] + load_layer_names = [ + "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear" + ] return module.__class__ in load_layers or module._get_name() in load_layer_names def load_buffer(module, state_dict, prefix): @@ -316,7 +328,7 @@ def _replace(self, child, name, conv_linear_layer): data = child.weight.data.split(get_shard_size_list( weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) - data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() + data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach() del data setattr(child, "replaced", True) @@ -324,9 +336,10 @@ def _replace(self, child, name, conv_linear_layer): return LmHeadLinearAllreduce( torch.nn.parameter.Parameter(data_dc, requires_grad=False), dist.get_rank(), dist.get_world_size(), child.bias if child.bias is None else torch.nn.parameter.Parameter( - child.bias.to(get_accelerator().current_device_name())), self.mp_group) + move(child.bias, + get_accelerator().current_device_name())), self.mp_group) return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \ - torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) + torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name())), self.mp_group) else: # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] @@ -338,30 +351,41 @@ def _replace(self, child, name, conv_linear_layer): #for detecting fused type module_str = str(self.module).strip() #The copy is a regular copy, The shape of dst and src is the same - data_dc = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) - - bias_data_dc = None if child.bias is None else prepare_tp_fused_qkvw( - module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( - get_accelerator().current_device_name()) + data_dc = move( + prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index), + get_accelerator().current_device_name()) + bias_data_dc = None if child.bias is None else move( + prepare_tp_fused_qkvw(module_str, child.bias.data, self.mp_size, mp_replace.gpu_index), + get_accelerator().current_device_name()) else: - data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), - dim=1 if self.conv_linear_layer else 0) - data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() - del data + if 'gate' in name and 'mixtral' in str(type(self.module)): + data_dc = child.weight.data + bias_data_dc = None if child.bias is None else child.bias.data + + elif 'w2' in name and 'mixtral' in str(type(self.module)): + data = child.weight.data.split(get_shard_size_list(weight_shape[1], self.mp_size), dim=1) + data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach() + del data + bias_data_dc = None if child.bias is None else \ + torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name())) - if child.bias is not 
None: - bias_data = child.bias.data.split(get_shard_size_list( - weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), - dim=0) - bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) - bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) - del bias_data else: - bias_data_dc = None + data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), + dim=1 if self.conv_linear_layer else 0) + data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach() + del data + if child.bias is not None: + bias_data = child.bias.data.split(get_shard_size_list( + weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), + dim=0) + bias_data = move(bias_data[mp_replace.gpu_index], get_accelerator().current_device_name()) + bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) + del bias_data + else: + bias_data_dc = None setattr(child, "replaced", True) - return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc.to(get_accelerator().current_device_name()), requires_grad=False), \ - bias=bias_data_dc) + return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc) def _slice_embedding(self, child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index 83e109167ffe..ab6286325eed 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -203,6 +203,12 @@ def set_mlp(self, _h4h_w, _h4h_b, _4hh_w, _4hh_b): self._4hh_b = _4hh_b def set_layernorm(self, attn_nw, attn_nb, input_nw, input_nb): + #TODO SW-164572: remove below mark_step WA once SW-164573 is resolved. + if get_accelerator().device_name() == 'hpu': + import habana_frameworks.torch.hpu as thpu + if thpu.is_initialized(): + import habana_frameworks.torch.core as htcore + htcore.mark_step() self.attn_nw = attn_nw self.attn_nb = attn_nb self.input_nw = input_nw diff --git a/deepspeed/module_inject/containers/bloom.py b/deepspeed/module_inject/containers/bloom.py index 05f30eec8d85..c103b17e1559 100644 --- a/deepspeed/module_inject/containers/bloom.py +++ b/deepspeed/module_inject/containers/bloom.py @@ -23,6 +23,7 @@ def __init__(self, **kwargs): # All model specific things should be defined here instead of the base class. self.bigscience_bloom = True + self.triangular_masking = False def create_module(self, config=None): _config = config if config is not None else self.ds_model_config diff --git a/deepspeed/module_inject/containers/llama.py b/deepspeed/module_inject/containers/llama.py index f6157e5cdfed..d0ae9e35cc8e 100644 --- a/deepspeed/module_inject/containers/llama.py +++ b/deepspeed/module_inject/containers/llama.py @@ -25,6 +25,7 @@ class DS_LLAMAContainer(MetaTensorContainer, HybridGatedMLPContainer, HybridSpli def __init__(self, **kwargs): super().__init__(**kwargs) + self.triangular_masking = False # All model specific things should be defined here instead of the base class. 
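Returning to the move() helper introduced in auto_tp.py earlier in this diff: the snippet below is a self-contained sketch of its behaviour (CPU-only, with invented shapes), showing why copy=True detaches a shard from the full weight it was split from, and why meta tensors are materialized with empty_like rather than copied.

```python
import torch


def move(tensor, device):
    if tensor.is_meta:
        # Meta tensors carry no data; materialize uninitialized storage on the target device.
        return torch.empty_like(tensor, device=device)
    else:
        # copy=True forces a fresh allocation, so a narrow view produced by split()
        # no longer keeps the full parent tensor alive (unlike .to() on the same device).
        return tensor.to(device, copy=True)


full = torch.randn(4, 8)
shard = full.split(2, dim=0)[0]          # a view into `full`
shard_copy = move(shard, 'cpu')          # new storage; `full` can now be freed
meta = torch.empty(2, 8, device='meta')  # e.g. a parameter loaded as a meta tensor
materialized = move(meta, 'cpu')         # empty_like on the real device
print(shard_copy.shape, materialized.shape)
```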
diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index a24d36344cdc..0244b3c8d235 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -17,6 +17,7 @@ import gc from deepspeed.accelerator import get_accelerator import re +from os import getenv def load_model_with_checkpoint(r_module, @@ -28,6 +29,8 @@ def load_model_with_checkpoint(r_module, rank=0, container=None): error_msgs = [] + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + llama_large_seq_perf_workaround = getenv("DEEPSPEED_LLAMA_LARGE_SEQ_PERF_WORKAROUND", "0").lower() in ["1", "true"] def prefix_check(): # if keys start with 'model.' or 'transformer.', don't skip level 0 prefix @@ -40,7 +43,11 @@ def prefix_check(): return False return True - skip_level_0_prefix = prefix_check() and container.policy.use_load_prefix + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + if llama_large_seq_perf_workaround and container is None: + skip_level_0_prefix = prefix_check() + else: + skip_level_0_prefix = prefix_check() and container.policy.use_load_prefix def transpose(data): with torch.no_grad(): @@ -224,6 +231,11 @@ def load_parameters(module, prefix): RMSNorm: load } + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + if llama_large_seq_perf_workaround: + print(f"Removed RMSNorm from layer_policies") + del layer_policies[RMSNorm] + all_ds_ids = {} def load_module_recursive(module, prefix='', level=0): @@ -248,17 +260,26 @@ def load_module_recursive(module, prefix='', level=0): if child.__class__ is nn.LayerNorm: child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) - elif child.__class__ in [nn.Linear, ColumnParallelLinear, RowParallelLinear]: + elif (not llama_large_seq_perf_workaround + and child.__class__ in [nn.Linear, ColumnParallelLinear, RowParallelLinear]): child = LinearLayer(weight_shape=child.weight.shape, bias=child.bias) setattr(module, name, child) + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + elif llama_large_seq_perf_workaround and child.__class__ is nn.Linear: + child = LinearLayer(weight_shape=child.weight.shape, dtype=child.weight.dtype, bias=child.bias) + setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) setattr(module, name, child) - elif child.__class__ in [LlamaRMSNorm, RMSNorm]: + elif not llama_large_seq_perf_workaround and child.__class__ in [LlamaRMSNorm, RMSNorm]: child = RMSNormalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps if hasattr(child, 'eps') else child.variance_epsilon) setattr(module, name, child) + # TODO SW-170491: remove the below WA once SW-170491 is resolved. 
+ elif llama_large_seq_perf_workaround and child.__class__ is LlamaRMSNorm: + child = RMSNormalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.variance_epsilon) + setattr(module, name, child) else: ds_id = None if hasattr(child.weight, 'ds_id'): diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py index 41df2b85dc0c..d9cb8c2d9eb5 100644 --- a/deepspeed/module_inject/policy.py +++ b/deepspeed/module_inject/policy.py @@ -27,7 +27,7 @@ class DSPolicy(ABC): _orig_layer_class = None def __init__(self): - self.cuda_graph_supported = False + self.cuda_graph_supported = False if get_accelerator().device_name() != 'hpu' else True @abstractmethod def attention(self): @@ -62,7 +62,7 @@ def __init__( # Type of normalization to perform norm_type=NormType.LayerNorm): super().__init__() - self.cuda_graph_supported = False + self.cuda_graph_supported = False if get_accelerator().device_name() != 'hpu' else True self.inference = inference self.linear_layer = linear_layer self.scale_attention = scale_attention diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index fe32378613c9..b3bc04ab33fc 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -202,7 +202,9 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m micro_batch_size = -1 seed = -1 local_rank = -1 - + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + llama_large_seq_perf_workaround = os.getenv("DEEPSPEED_LLAMA_LARGE_SEQ_PERF_WORKAROUND", + "0").lower() in ["1", "true"] mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group, mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) @@ -321,7 +323,8 @@ def set_lm_head(module): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") return module - if checkpoint_dict is not None and not config.replace_with_kernel_inject: + # TODO SW-170491: remove the below WA once SW-170491 is resolved. + if not llama_large_seq_perf_workaround and checkpoint_dict is not None and not config.replace_with_kernel_inject: # AutoTP shard loading checkpoint = checkpoint_dict["checkpoints"] pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") @@ -344,9 +347,11 @@ def set_lm_head(module): quantizer = GroupQuantizer(q_int8=quantize) world_size = dist.get_world_size() if dist.is_initialized() else 1 rank = dist.get_rank() if dist.is_initialized() else 0 - if checkpoint_dict is not None and config.replace_with_kernel_inject: - assert container_g.ckpt_load_enabled, \ - f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container" + # TODO SW-170491: remove the below WA once SW-170491 is resolved. 
+ if checkpoint_dict is not None and (config.replace_with_kernel_inject or llama_large_seq_perf_workaround): + if not llama_large_seq_perf_workaround or get_accelerator().device_name() != "hpu": + assert container_g.ckpt_load_enabled, \ + f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container" start_time = time.time() checkpoint = checkpoint_dict['checkpoints'] ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint @@ -563,7 +568,11 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No """ sd = None if checkpoint is not None: - sd = torch.load(checkpoint, map_location='cpu') + if checkpoint.endswith(".safetensors"): + from safetensors.torch import load_file + sd = load_file(checkpoint) + else: + sd = torch.load(checkpoint, map_location='cpu') policy = {} if orig_class is not None: policy.update({orig_class: (replace_fn, _replace_policy)}) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index eb6ce2f75c69..e04369f0efd1 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -89,7 +89,7 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count torch.empty(self.hidden_size_per_partition * 3, dtype=data_type_fp, device=device) ] - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] @@ -108,7 +108,10 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=DeepSpeedSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -136,7 +139,8 @@ def forward(self, output_attentions=False, norm_w=None, norm_b=None, - alibi=None): + alibi=None, + **kwargs): if self.attn_qkvw is None: self._attn_qkvw, self._attn_qkvb = self._merge_qkv() else: @@ -157,10 +161,17 @@ def forward(self, gamma=norm_w, beta=norm_b) + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] @@ -210,7 +221,7 @@ def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_ return tensor_list - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] @@ -246,8 +257,10 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): matmul_result = torch.matmul(query_layer, key_layer) # change view to [batch_size, num_heads, q_length, k_length] attention_scores = matmul_result.view(output_size[0], output_size[1], output_size[2], -1) - - offset = dist.get_rank() * 
self.num_attention_heads_per_partition if dist.is_initialized() else 0 + if self.config.mp_size > 1 and dist.is_initialized(): + offset = dist.get_rank() * self.num_attention_heads_per_partition + else: + offset = 0 target_dtype = torch.float16 if self.config.dtype == torch.int8 else self.config.dtype # When using the hybrid engine with BLOOM, input_mask needs to be converted from torch.bool -> torch.int64 @@ -255,7 +268,7 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): input_mask = input_mask.long() attention_probs = self.softmax_func(attn_scores=attention_scores, - attn_mask=((1 - input_mask).to(target_dtype) * minus_inf), + attn_mask=input_mask.to(target_dtype) * minus_inf, alibi=alibi, triangular=(self.config.triangular_masking and (attention_scores.shape[-2] > 1)), diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 6f9b35cbc05d..feaba15f8985 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -38,8 +38,8 @@ def forward(self, final_bias: Optional[torch.Tensor] = None): if self.residual_add_func is not None: - if final_bias is None: - residual = self._vector_add(residual, hidden_state, 1.0 / self.config.mp_size) + if final_bias is None and attention_bias is None: + residual = self._vector_add(residual + attention_output, hidden_state, 1.0 / self.config.mp_size) else: if not self.config.pre_layer_norm and residual_add is not None: # only use residual add if its set and we are not pre layer norm diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index bc309d94df14..f5faeb9f9958 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -30,14 +30,19 @@ def softmax_fallback(self, attn_scores, attn_mask, alibi, triangular, recompute, if os.environ.get('DS_KI_FALLBACK') == 'True': alibi = alibi[head_offset:head_offset + self.num_attention_heads_per_partition] input_dtype = attn_scores.dtype + attn_scores *= layer_scale if (triangular): tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) - attn_scores = torch.masked_fill(attn_scores * layer_scale, tri, torch.finfo(input_dtype).min) + attn_scores = torch.masked_fill(attn_scores, tri, torch.finfo(input_dtype).min) if alibi is not None: attn_scores += alibi if attn_mask is not None: # expand atten_mask from two dim into 4 dim, insert two dims in the middle - attn_mask = attn_mask[:, None, None, :] + if len(attn_mask.shape) == 2: + # The above if statement was added because the mask was already 4D so this + # expansion should be avoided as it expands to 6D and crashes later (in bloom + # HE KI FB) + attn_mask = attn_mask[:, None, None, :] attn_scores += attn_mask output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 0dc4e08a3633..16cc22b34b5d 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -25,11 +25,12 @@ def __init__(self, config: DeepSpeedInferenceConfig): def softmax_context_fallback(self, query_key_value, attn_mask, rotary_dim, rotate_half, 
rotate_every_two, heads, num_kv, norm_factor, triangular_masking, local_attention, window_size, no_masking, - layer_id, num_layers, alibi, rope_theta): + layer_id, num_layers, alibi, rope_theta, is_prompt, token_idx, position_ids): raise NotImplementedError def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, num_kv: int, - norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): + norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor, + is_prompt: bool, token_idx: torch.Tensor, position_ids: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -42,6 +43,6 @@ def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: self.config.rotate_every_two, heads, num_kv, norm_factor, self.config.triangular_masking, self.config.local_attention, self.config.window_size, no_masking, layer_id, num_layers, alibi, - self.config.rope_theta) + self.config.rope_theta, is_prompt, token_idx, position_ids) return output diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py index c05370ec74e5..6845d91b06be 100644 --- a/deepspeed/ops/transformer/inference/triton/attention.py +++ b/deepspeed/ops/transformer/inference/triton/attention.py @@ -125,7 +125,7 @@ def _triton_autotune(min_seqlen, context_4d_matmul(output, qkv, head_size) Fp16Matmul._update_autotune_table() - def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list): qkv_out = qkv_out[0] @@ -143,7 +143,10 @@ def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=TritonSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -161,7 +164,8 @@ def forward( norm_w=None, norm_b=None, alibi=None, - use_triton_attention=True): + use_triton_attention=True, + **kwargs): if not self.config.pre_layer_norm: qkv_out = self.linear_func(input=input, @@ -192,10 +196,16 @@ def forward( triangular=self.triangular_masking) key_layer, value_layer = qkv[:, :, self.hidden_size:2 * self.hidden_size], qkv[:, :, 2 * self.hidden_size:] else: + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) context_layer, key_layer, value_layer = self.ds_compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 772d23f2d0ac..9d66bc49da62 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -439,7 +439,9 @@ def get_partitioned_activations_for_backward(args, inputs, contiguous_checkpoint num_non_fp_tensors += 1 continue - arg.data = inp.data + arg.data = torch.empty([], device=arg.device).data + arg.saved_data =
inp.data + new_args.append(arg) i = arg_index - num_non_fp_tensors @@ -472,7 +474,8 @@ def get_cpu_activations_for_backward(args, inputs): new_args.append(arg) continue - arg.data = inp.data + arg.data = torch.empty([], device=arg.device).data + arg.saved_data = inp.data new_args.append(arg) return new_args @@ -602,6 +605,9 @@ def backward(ctx, *grads): # removing pointers to the contiguous buffer memory # so that they can be garbage collected once the checkpoints # have been used + if grads[0].device.type == 'hpu': + import habana_frameworks.torch as htorch + htorch.core.mark_step() if SYNCHRONIZE: get_accelerator().synchronize() if PROFILE_TIME: @@ -628,6 +634,12 @@ def backward(ctx, *grads): global cuda_device, transport_stream, PARTITION_ACTIVATIONS + # Rebuild deepspeed_saved_tensors + for t in ctx.deepspeed_saved_tensors: + if t is not None and hasattr(t, 'saved_data') and t.saved_data is not None: + t.data = t.saved_data.to(t.device) + t.saved_data = None + if PARTITION_ACTIVATIONS: # with get_accelerator().stream(transport_stream): inputs = gather_partitioned_activations(ctx.deepspeed_saved_tensors, diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index a02ddbe86403..1ac91816f564 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -37,7 +37,9 @@ def __init__(self, norm_type=2, allgather_bucket_size=5000000000, dp_process_group=None, - timers=None): + timers=None, + grad_acc_dtype=None, + accumulate_grads_via_hooks=False): super().__init__() see_memory_usage('begin bf16_optimizer', force=True) self.timers = timers @@ -51,6 +53,8 @@ def __init__(self, self.allgather_bucket_size = int(allgather_bucket_size) self.dp_process_group = dp_process_group self.dp_rank = dist.get_rank(group=self.dp_process_group) + self.grad_acc_dtype = grad_acc_dtype + self.accumulate_grads_via_hooks = accumulate_grads_via_hooks self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] # Use torch (un)flatten ops @@ -119,7 +123,9 @@ def _setup_for_real_optimizer(self): num_elem_list = [t.numel() for t in self.bf16_groups[i]] # create fp32 gradients - self.fp32_groups_gradients_flat.append(torch.zeros_like(self.bf16_groups_flat[i], dtype=torch.float32)) + assert self.grad_acc_dtype in [torch.float32, torch.bfloat16] + self.fp32_groups_gradients_flat.append( + torch.zeros_like(self.bf16_groups_flat[i], dtype=self.grad_acc_dtype)) # track individual fp32 gradients for entire model fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i], @@ -156,6 +162,9 @@ def _setup_for_real_optimizer(self): self.initialize_optimizer_states() see_memory_usage('end initialize_optimizer', force=True) + if self.accumulate_grads_via_hooks: + self.create_grad_acc_hooks() + # Need optimizer states initialized before linking lp to optimizer state self._link_all_hp_params() self._enable_universal_checkpoint() @@ -204,10 +213,17 @@ def initialize_optimizer_states(self): """ for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, self.fp32_groups_gradient_flat_partition): - param_partition.grad = grad_partition + if self.grad_acc_dtype is torch.bfloat16: + param_partition.grad = grad_partition.to(param_partition.dtype) + else: + param_partition.grad = grad_partition self.optimizer.step() + if self.grad_acc_dtype is torch.bfloat16: + for param_partition in self.fp32_groups_flat_partition: + param_partition.grad = None + self.clear_hp_grads() def _split_flat_tensor(self, 
flat_tensor, num_elem_list): @@ -223,6 +239,10 @@ def _split_flat_tensor(self, flat_tensor, num_elem_list): def _update_storage_to_flattened_tensor(self, tensor_list, flat_tensor): updated_params = self.unflatten(flat_tensor, tensor_list) + # TODO: SW-170341 need to remove the below WA once SW-169269 is resolved + if os.environ.get('DEEPSPEED_HPU_SYNC_INSIDE_INIT', '0') == '1': + import habana_frameworks.torch as ht + ht.hpu.synchronize() for p, q in zip(tensor_list, updated_params): p.data = q.data @@ -246,8 +266,19 @@ def step(self, closure=None): global_norm=all_groups_norm, mpu=self.mpu) + for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, + self.fp32_groups_gradient_flat_partition): + if self.grad_acc_dtype is torch.bfloat16: + param_partition.grad = grad_partition.to(param_partition.dtype) + else: + param_partition.grad = grad_partition + self.optimizer.step() + if self.grad_acc_dtype is torch.bfloat16: + for param_partition in self.fp32_groups_flat_partition: + param_partition.grad = None + self.update_lp_params() self.clear_hp_grads() @@ -264,27 +295,33 @@ def backward(self, loss, update_hp_grads=True, clear_lp_grads=False, **bwd_kwarg self.clear_lp_grads() loss.backward(**bwd_kwargs) - if update_hp_grads: + if not self.accumulate_grads_via_hooks and update_hp_grads: self.update_hp_grads(clear_lp_grads=clear_lp_grads) @torch.no_grad() - def update_hp_grads(self, clear_lp_grads=False): - for i, group in enumerate(self.bf16_groups): - for j, lp in enumerate(group): - if lp.grad is None: - continue + def update_hp_grad(self, lp, group_idx, param_idx, clear_lp_grads): + if lp.grad is None: + return - hp_grad = self.fp32_groups_gradients[i][j] - assert hp_grad is not None, \ - f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{i}][{j}]' + hp_grad = self.fp32_groups_gradients[group_idx][param_idx] + assert hp_grad is not None, \ + f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{group_idx}][{param_idx}]' - hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape)) - lp._hp_grad = hp_grad - self.fp32_groups_has_gradients[i][j] = True + hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape)) + lp._hp_grad = hp_grad + self.fp32_groups_has_gradients[group_idx][param_idx] = True - # clear gradients - if clear_lp_grads: - lp.grad = None + # clear gradients + if clear_lp_grads: + lp.grad = None + + @torch.no_grad() + def update_hp_grads(self, clear_lp_grads=False): + if self.accumulate_grads_via_hooks: + return + for i, group in enumerate(self.bf16_groups): + for j, lp in enumerate(group): + self.update_hp_grad(lp, i, j, clear_lp_grads) @torch.no_grad() def get_grads_for_reduction(self): @@ -307,7 +344,6 @@ def get_grads_for_norm(self, for_clipping=False): continue grads.append(self.fp32_groups_gradients[i][j]) - return grads @torch.no_grad() @@ -320,6 +356,7 @@ def update_lp_params(self): # if i == 0: # print_rank_0(f'{fp32_partition[:10]=}', force=True) + #TODO: SW-90304 call all_gather_dp_groups with async_op=true if zero optimizer hpu_use_async_collectives is enabled all_gather_dp_groups(groups_flat=self.bf16_groups_flat, partitioned_param_groups=self.bf16_partitioned_groups, dp_process_group=self.real_dp_process_group, @@ -414,6 +451,28 @@ def _load_hp_checkpoint_state(self, checkpoint_dir): lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, tp_world_size) + def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx): + assert 
self.accumulate_grads_via_hooks + self.update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False) + + def create_grad_acc_hooks(self): + self.grad_accs = [] + for i, param_group in enumerate(self.bf16_groups): + for j, param in enumerate(param_group): + if param.requires_grad: + + def wrapper(param, i, j): + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + + def accumulate_hp_grads_and_remove_lp(*notneeded): + self.accumulate_hp_grads_and_remove_lp(param, i, j) + + grad_acc.register_hook(accumulate_hp_grads_and_remove_lp) + self.grad_accs.append(grad_acc) + + wrapper(param, i, j) + def _get_padded_tensor(src_tensor, size): if src_tensor.numel() >= size: diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py index 5cd44864bb2e..67544c54c67e 100644 --- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -19,7 +19,8 @@ def create(self, tag): def save(self, state_dict, path: str): logger.info(f"[Torch] Saving {path}...") - torch.save(state_dict, path) + # TODO SW-169821: remove WA: Once SW-169817 is solved to allow numpy() HPU un-detached tensors + save(state_dict, path, pickle_protocol=4) logger.info(f"[Torch] Saved {path}.") return None @@ -32,3 +33,25 @@ def load(self, path: str, map_location=None): def commit(self, tag): logger.info(f"[Torch] Checkpoint {tag} is ready now!") return True + + +# TODO SW-169821: remove WA: Once SW-169817 is solved to allow numpy() HPU un-detached tensors +def save(data, filename, pickle_protocol=4): + + def convert_for_pickle(obj): + if isinstance(obj, torch.Size): + return obj + elif isinstance(obj, dict): + return {k: convert_for_pickle(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_for_pickle(e) for e in obj] + elif isinstance(obj, tuple): + return tuple([convert_for_pickle(e) for e in obj]) + else: + if isinstance(obj, torch.Tensor): + return obj.data.detach().clone().cpu() + else: + return obj + + data = convert_for_pickle(data) + torch.save(data, filename, pickle_protocol=pickle_protocol) diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py new file mode 100644 index 000000000000..603f563fca60 --- /dev/null +++ b/deepspeed/runtime/compiler.py @@ -0,0 +1,166 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Union, Callable, Dict, Any +import importlib +import torch +from ..pydantic_v1 import validator +from .config_utils import DeepSpeedConfigModel + +COMPILE_CONFIG = "compile" + + +def is_compile_supported(): + return hasattr(torch, "compile") + + +def disable(func): + if is_compile_supported(): + return torch.compiler.disable(func) + return func + + +def get_compile_config(param_dict): + if COMPILE_CONFIG in param_dict: + compile_config_dict = param_dict[COMPILE_CONFIG] + else: + compile_config_dict = {} + return CompileConfig(**compile_config_dict) + + +def get_backend_fn(backend: Union[str, Callable]) -> Union[str, Callable]: + if isinstance(backend, Callable): + return backend + + elif isinstance(backend, str): + if backend in torch._dynamo.list_backends(): + return backend + + # Get module name from backend name + module_name = '.'.join(backend.split('.')[:-1]) + fn_name = backend.split('.')[-1] + + try: + module = importlib.import_module(module_name) + backend_fn = getattr(module, fn_name) + except ImportError: + raise ValueError( + f"The backend {backend} is not in the list of available backends and could not be imported.") + return backend_fn + + raise ValueError(f"backend for torch.compile must be a string or Callable: {backend}") + + +class CompileConfig(DeepSpeedConfigModel): + """ + [EXPERIMENTAL] This configuration enables users to activate `torch.compile` within DeepSpeed and customize its settings. + Please be aware that these features and API designs are experimental and subject to change. + """ + + enabled: bool = False + """ + Enable torch.compile when True. + """ + + backend: str = "inductor" + """ + Passed to `backend` argument of torch.compile. + If the given value is not in torch._dynamo.list_backends(), + DeepSpeed attempts to import and instantiate the module with the given name. + """ + + kwargs: Dict[str, Any] = {} + """ + Passed to `kwargs` argument of torch.compile. + """ + + @validator("enabled") + def validate_enabled(cls, field_value, values): + if field_value and not is_compile_supported(): + raise ValueError("torch.compile is not supported on this version of PyTorch.") + return field_value + + +class CompiledModuleWrapper(torch.nn.Module): + + def __init__(self, module, compile_config: Union[CompileConfig, None] = None): + super().__init__() + + assert is_compile_supported(), "torch.compile is not supported on this version of PyTorch." + + modules = self.__dict__.get('_modules') + modules['wrapped'] = module + self.__dict__['wrapped'] = module + self._is_compiled = False + self._backend = get_backend_fn(compile_config.backend) + self._compile_kwargs = compile_config.kwargs + self._compiler_fn = None + + def __getattr__(self, name): + return getattr(self.__dict__['wrapped'], name) + + def set_backend(self, backend: Union[str, Callable]): + """Set the backend for torch.compile. + + Args: + backend (Union[str, Callable]): backend name or a function that takes a torch.nn.Module and returns a compiled module. + You can directly pass a function that works as a backend. + See also `backend` field in `CompileConfig` for more details. + """ + self._backend = get_backend_fn(backend) + + def set_torch_compile_kwargs(self, kwargs: Dict[str, Union[str, Any]]) -> None: + """Set kwargs for torch.compile. Kwargs that are set in DeepSpeed config will be overwritten. + You can also pass a backend name with "backend" key to change the backend. 
+ + Args: + kwargs (Dict[str, Union[str, Any]]): kwargs passed to torch.compile. + """ + + if "backend" in kwargs: + raise ValueError("backend cannot be set as compile kwargs. Use set_backend instead.") + self._compile_kwargs.update(kwargs) + + def set_compiler_fn(self, compiler_fn: Callable) -> None: + """Set a function to be used for compiling the module. + This function should take a torch.nn.Module as input and return a compiled module. + Note that other compile options are ignored when a compiler_fn is set. + + Example: + ```python + def my_compiler_fn(module: torch.nn.Module): + ... + return torch.compile(module, ...) + + engine.set_compiler_fn(my_compiler_fn) + ``` + """ + self._compiler_fn = compiler_fn + + def forward(self, *args, **kwargs) -> Any: + if not self.is_compiled: + if self._compiler_fn is None: + self.__dict__['wrapped'] = torch.compile(self.wrapped, backend=self._backend, **self._compile_kwargs) + else: + self.__dict__['wrapped'] = self._compiler_fn(self.wrapped) + self._is_compiled = True + + return self.__dict__['wrapped'](*args, **kwargs) + + @property + def is_compiled(self) -> bool: + return self._is_compiled + + @property + def backend(self) -> Union[str, Callable]: + return self._backend + + @property + def torch_compile_kwargs(self) -> Dict[str, Any]: + return self._compile_kwargs + + @property + def compiler_fn(self) -> Union[Callable, None]: + return self._compiler_fn diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index b49469b94f11..3b049131a8e6 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -31,6 +31,7 @@ from ..comm.config import DeepSpeedCommsConfig from ..monitor.config import get_monitor_config from ..inference.config import WeightQuantConfig +from .compiler import get_compile_config from deepspeed import comm as dist from deepspeed.runtime.config_utils import DeepSpeedConfigModel @@ -168,6 +169,13 @@ def get_bfloat16_enabled(param_dict): return False +def get_bfloat16_accumulate_grads_via_hooks(param_dict): + for key in [BFLOAT16, BFLOAT16_OLD]: + if key in param_dict.keys(): + return get_scalar_param(param_dict[key], BFLOAT16_GRAD_ACC_VIA_HOOKS, BFLOAT16_GRAD_ACC_VIA_HOOKS_DEFAULT) + return False + + def get_fp16_master_weights_and_grads_enabled(param_dict): if get_fp16_enabled(param_dict): return get_scalar_param(param_dict[FP16], FP16_MASTER_WEIGHTS_AND_GRADS, FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) @@ -495,6 +503,10 @@ def get_zero_force_ds_cpu_optimizer(param_dict): return get_scalar_param(param_dict, ZERO_FORCE_DS_CPU_OPTIMIZER, ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT) +def get_zero_allow_comm_data_type_fp32(param_dict): + return get_scalar_param(param_dict, ZERO_ALLOW_COMM_DATA_TYPE_FP32, ZERO_ALLOW_COMM_DATA_TYPE_FP32_DEFAULT) + + def get_scheduler_name(param_dict): if SCHEDULER in param_dict.keys() and TYPE in param_dict[SCHEDULER].keys(): return param_dict[SCHEDULER][TYPE] @@ -813,6 +825,7 @@ def _initialize_params(self, param_dict): self.fp16_enabled = get_fp16_enabled(param_dict) self.fp16_auto_cast = get_fp16_auto_cast(param_dict) self.bfloat16_enabled = get_bfloat16_enabled(param_dict) + self.bfloat16_accumulate_grads_via_hooks = get_bfloat16_accumulate_grads_via_hooks(param_dict) assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(param_dict) @@ -894,6 +907,8 @@ def _initialize_params(self, param_dict): self.weight_quantization_config = 
WeightQuantConfig( **param_dict['weight_quantization']) if 'weight_quantization' in param_dict else None + self.compile_config = get_compile_config(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 96f2a38bd05c..4c446265eb27 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -126,6 +126,10 @@ BFLOAT16_ENABLED = "enabled" BFLOAT16_ENABLED_DEFAULT = False +# bf16 optimizer gradient accumulation via hooks +BFLOAT16_GRAD_ACC_VIA_HOOKS = "accumulate_grads_via_hooks" +BFLOAT16_GRAD_ACC_VIA_HOOKS_DEFAULT = False + ######################################### # FP16 support ######################################### diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 8f62f36f328e..1fd3a8795a51 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -90,6 +90,7 @@ from .pipe.module import PipelineModule from .utils import get_ma_status +from .compiler import CompiledModuleWrapper from ..ops.adam import FusedAdam from ..moe.sharded_moe import TopKGate, MOELayer from ..moe.layer import MoE @@ -175,21 +176,19 @@ def __init__(self, enable_micro_timers, enable_global_timers): class DeepSpeedEngine(Module): r"""DeepSpeed engine for training.""" - def __init__( - self, - args, - model, - optimizer=None, - model_parameters=None, - training_data=None, - lr_scheduler=None, - mpu=None, - dist_init_required=None, - collate_fn=None, - config=None, - config_class=None, - dont_change_device=False, - ): + def __init__(self, + args, + model, + optimizer=None, + model_parameters=None, + training_data=None, + lr_scheduler=None, + mpu=None, + dist_init_required=None, + collate_fn=None, + config=None, + config_class=None, + dont_change_device=False): super(DeepSpeedEngine, self).__init__() self.dont_change_device = dont_change_device self.client_optimizer = optimizer @@ -219,6 +218,7 @@ def __init__( self.num_experts = [] self.gate_modules = [] self.moe_layers = [] + self.has_sequence_parallel_params = False self._step_applied = False self._global_grad_norm = None self.use_ds_comm = False # False --> Use torch.dist, True --> Use ds.comm backend. 
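Taken together, the pieces above are driven from the user config: get_compile_config() reads a top-level "compile" section, and get_bfloat16_accumulate_grads_via_hooks() reads "accumulate_grads_via_hooks" from the bf16 section. A minimal client-side sketch follows; the "compile", "enabled", "backend", "kwargs" and "accumulate_grads_via_hooks" keys come from the code above, while the "bf16"/"optimizer" section names and the launch boilerplate are the usual DeepSpeed conventions, assumed rather than shown in this diff.

```python
# Sketch of a config exercising the new options (assumed section names noted above).
import torch
import deepspeed

model = torch.nn.Linear(8, 8)

ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "bf16": {
        "enabled": True,
        # accumulate gradients into the high-precision copies via autograd hooks
        "accumulate_grads_via_hooks": True,
    },
    # experimental torch.compile integration parsed by get_compile_config()
    "compile": {
        "enabled": True,
        "backend": "inductor",
        "kwargs": {"dynamic": False},
    },
}

# Under a normal `deepspeed` launch this wraps the module in
# CompiledModuleWrapper because compile.enabled is True.
engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               model_parameters=model.parameters(),
                                               config=ds_config)
```

Because CompiledModuleWrapper compiles lazily on the first forward pass, the backend or kwargs can still be adjusted after initialization via engine.set_backend(), engine.set_torch_compile_kwargs(), or engine.set_compiler_fn().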
@@ -227,18 +227,19 @@ def __init__( self._is_gradient_accumulation_boundary = None self.scale_wrt_gas = None - self.losses = 0.0 + self.losses = [] # for debug purposes - can then debug print: debug_get_module_name(module) debug_extract_module_and_param_names(model) - # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict - self.param_names = {param: name for name, param in model.named_parameters()} - self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) + + if self.fp16_enabled() and not get_accelerator().is_fp16_supported(): + raise ValueError("Type fp16 is not supported.") + if mpu is not None: if self.elasticity_enabled(): if not self.is_elastic_model_parallel_supported(): @@ -261,6 +262,9 @@ def __init__( # Configure distributed model self._configure_distributed_model(model) + # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict + self.param_names = {param: name for name, param in model.named_parameters()} + self._get_model_parameters() see_memory_usage(f"DeepSpeed Engine: After configure distributed model") @@ -310,6 +314,14 @@ def __init__( elif self.bfloat16_enabled(): self.optimizer = self._configure_bf16_optimizer(optimizer=None) + #Sequence parallel related initialization + for param in self.module.parameters(): + if getattr(param, 'sequence_parallel', False): + self.has_sequence_parallel_params = True + break + if self.has_sequence_parallel_params: + assert self.mpu is not None, "sequence parallel allreduce only supported with tensor parallel enabled" + # Hook optimizer for snip_momentum pruning if hasattr(model, 'pruners'): from ..compression.helper import rewrite_optimizer_step @@ -359,6 +371,9 @@ def __init__( self.flatten = _flatten_dense_tensors self.unflatten = _unflatten_dense_tensors + if self._config.compile_config.enabled: + self._set_client_model(CompiledModuleWrapper(self.module, self._config.compile_config)) + def destroy(self): if self.optimizer is not None and hasattr(self.optimizer, 'destroy'): self.optimizer.destroy() @@ -448,7 +463,10 @@ def get_global_grad_norm(self) -> float: Returns: float: norm """ - return self._global_grad_norm + grad_norm = self._global_grad_norm + if isinstance(grad_norm, torch.Tensor): + grad_norm = grad_norm.item() + return grad_norm def __getattr__(self, name): """ @@ -462,6 +480,13 @@ def __getattr__(self, name): return getattr(self, name) elif name in dir(_module): return getattr(_module, name) + elif isinstance(_module, CompiledModuleWrapper): + try: + return getattr(_module, name) + except AttributeError: + raise AttributeError( + f"None of {type(self).__name__}, CompiledModuleWrapper, or the wrapped model has the attribute '{name}'" + ) else: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") @@ -954,14 +979,22 @@ def _scheduler_from_config(self, optimizer): def _set_distributed_vars(self, args): device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank if device_rank >= 0: - get_accelerator().set_device(device_rank) - self.device = torch.device(get_accelerator().device_name(), device_rank) + # todo SW-143933 remove the below HPU wa for torch.device initialization, ticket SW-143931 has to be solved. 
+ if get_accelerator().device_name() == "hpu": + self.device = torch.device("hpu") + else: + get_accelerator().set_device(device_rank) + self.device = torch.device(get_accelerator().device_name(), device_rank) self.world_size = dist.get_world_size() self.global_rank = dist.get_rank() else: self.world_size = 1 self.global_rank = 0 - self.device = torch.device(get_accelerator().device_name()) + # todo SW-143933 remove the below HPU wa for torch.device initialization, ticket SW-143931 has to be solved. + if get_accelerator().device_name() == "hpu": + self.device = torch.device("hpu") + else: + self.device = torch.device(get_accelerator().device_name()) # Configure based on command line arguments def _configure_with_arguments(self, args, mpu): @@ -1180,9 +1213,12 @@ def _do_optimizer_sanity_check(self, basic_optimizer): # data type checks elif model_dtype == grad_accum_dtype: if model_dtype == torch.bfloat16: - raise NotImplementedError( + logger.warning( "Bfloat16 wrapper must use a gradient accumulation type of fp32, enable ZeRO to use Bfloat16 gradient accumulation" ) + logger.warning( + "BF16 gradient accumulation is not safe numerically with large number of accumulation steps") + return BFLOAT16 if model_dtype == torch.float16: return FP16 # else optimizer_wrapper = None @@ -1237,7 +1273,7 @@ def _configure_optimizer(self, client_optimizer, model_parameters): else: self.optimizer = basic_optimizer - log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0]) + log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer.__class__.__name__), ranks=[0]) self.compression_scheduler = self._configure_compression_scheduler() self.quantizer = self._configure_quantization() @@ -1252,6 +1288,11 @@ def _configure_basic_optimizer(self, model_parameters): "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" ) + optimizer = get_accelerator().get_optimizer(self.optimizer_name(), self.zero_use_cpu_optimizer(), + model_parameters, **optimizer_parameters) + if optimizer is not None: + return optimizer + if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]: torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False) adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT) @@ -1444,7 +1485,9 @@ def _configure_bf16_optimizer(self, optimizer): clip_grad=clip_grad, allgather_bucket_size=self.zero_allgather_bucket_size(), dp_process_group=self.seq_data_parallel_group, - timers=timers) + timers=timers, + grad_acc_dtype=self.get_data_types()[1], + accumulate_grads_via_hooks=self._config.bfloat16_accumulate_grads_via_hooks) return optimizer @@ -1582,8 +1625,7 @@ def _configure_zero_optimizer(self, optimizer): communication_data_type=self.communication_data_type, zero_hpz_partition_size=self.zero_hpz_partition_size(), zero_quantized_weights=self.zero_quantized_weights(), - zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), - ) + zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -1739,14 +1781,15 @@ def eval(self): self.warn_unscaled_loss = True self.module.train(False) - def _scale_loss_by_gas(self, prescaled_loss): + def _scale_loss_by_gas(self, prescaled_loss, eval_micro_batches=None): + gas = self.gradient_accumulation_steps() if eval_micro_batches is None 
else eval_micro_batches if isinstance(prescaled_loss, torch.Tensor): - scaled_loss = prescaled_loss / self.gradient_accumulation_steps() + scaled_loss = prescaled_loss / gas elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list): scaled_loss = [] for l in prescaled_loss: if isinstance(l, torch.Tensor): - scaled_loss.append(l / self.gradient_accumulation_steps()) + scaled_loss.append(l / gas) else: scaled_loss.append(l) else: @@ -1921,16 +1964,20 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr loss = self._scale_loss_by_gas(loss.float()) # Log training loss - self.losses += loss.mean().item() - if self.monitor.enabled: + #TODO: SW-162087 keeping 0.9.4 code here to avoid perf drop + if self.monitor.enabled and not self.pipeline_parallelism: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: + loss_sum = torch.stack(self.losses).sum().item() if len(self.losses) > 0 else 0 self.summary_events = [( f"Train/Samples/train_loss", - self.losses, + loss_sum / self.gradient_accumulation_steps(), self.global_samples, )] self.monitor.write_events(self.summary_events) + self.losses = [] + else: + self.losses.append(loss.mean()) self._start_timers(self.engine_timers.backward_timers) @@ -2093,7 +2140,7 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: self._report_progress(self.global_steps + 1) - self.losses = 0.0 + self.losses = [] self.global_steps += 1 self.global_samples += self.train_batch_size() @@ -2417,6 +2464,14 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000) if self.has_moe_layers: self._reduce_expert_gradients(expert_grads, elements_per_buffer) + if self.has_sequence_parallel_params: + for i, group in enumerate(self.optimizer.bf16_groups): + for j, lp in enumerate(group): + if getattr(self.optimizer.bf16_groups[i][j], 'sequence_parallel', False): + dist.all_reduce(self.optimizer.fp32_groups_gradients[i][j], + op=dist.ReduceOp.SUM, + group=self.mpu.get_slice_parallel_group()) + def sparse_allreduce_no_retain(self, bucket, dp_group): allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group) # Densify sparse tensor and copy back to original location @@ -2767,7 +2822,6 @@ def _load_checkpoint(self, self._curr_ckpt_path = os.path.join(load_dir, tag) if self.has_moe_layers: - # print(checkpoint.keys()) old_moe_load = False if not isinstance(checkpoint['num_experts'], list): old_moe_load = True @@ -3201,7 +3255,6 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa state.update(client_state) logger.info(f'Saving model checkpoint: {save_path}') self.checkpoint_engine.save(state, save_path) - self._curr_save_path = None def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): name_function = (self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name) diff --git a/deepspeed/runtime/hpu_utils.py b/deepspeed/runtime/hpu_utils.py new file mode 100644 index 000000000000..5a7ca4d4b97f --- /dev/null +++ b/deepspeed/runtime/hpu_utils.py @@ -0,0 +1,11 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from deepspeed.accelerator import get_accelerator + + +def get_use_hpu(): + return get_accelerator().device_name() == "hpu" diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index c8d6a0bff444..faac24105aa3 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -149,11 +149,33 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): if self.global_rank != min(d['ranks']): tied_params += sum(p.numel() for p in d['module'].parameters()) unique_params -= tied_params - params_tensor = torch.LongTensor(data=[num_params, unique_params]).to(self.device) + + # Use Int32 representation instead of Int64 for calclations. + # num_param division & modulo after all reduce should be lower than MAX Int32. + # Using this value will be safe if used with less than ~2000 devices. + # Int32Max > all_reduce_group*chunk_size + chunk_size = 10**6 + + num_params_quotient = num_params // chunk_size + num_params_remainder = num_params % chunk_size + + unique_params_quotient = unique_params // chunk_size + unique_params_remainder = unique_params % chunk_size + + assert (unique_params_quotient * chunk_size + + unique_params_remainder) == unique_params, "Value mismatch after Int64 splitting" + assert (num_params_quotient * chunk_size + + num_params_remainder) == num_params, "Value mismatch after Int64 splitting" + + params_tensor = torch.IntTensor( + data=[num_params_quotient, num_params_remainder, unique_params_quotient, unique_params_remainder]).to( + self.device) + dist.all_reduce(params_tensor, group=self.grid.get_model_parallel_group()) params_tensor = params_tensor.tolist() - total_params = params_tensor[0] - unique_params = params_tensor[1] + total_params = params_tensor[0] * chunk_size + params_tensor[1] + unique_params = params_tensor[2] * chunk_size + params_tensor[3] + if self.grid.data_parallel_id == 0: logger.info(f'RANK={self.global_rank} ' f'STAGE={self.stage_id} ' @@ -183,6 +205,11 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self.first_output_send = True self.first_gradient_send = True + self.pipe_partition_input_meta_cache = None + self.pipe_partition_output_meta_cache = None + self.pipe_partition_grad_meta_cache = None + self.grad_partition_grad_layer_meta_cache = None + #stores the loss for the current micro batch being processed self.loss = torch.tensor(0.0).to(self.device) @@ -236,6 +263,13 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self.timers(STEP_MICRO_TIMER).start() self.timers(STEP_MICRO_TIMER).stop() + # TODO: SW-170309 need to remove the below WA once SW-170308 is resolved + import os + if os.environ.get('DEEPSPEED_SYNC_MICRO_BATCH_STEP', '0') == '1': + self.sync_micro_batch_step = True + else: + self.sync_micro_batch_step = False + def set_has_attention_mask(self, value): assert isinstance(value, bool) self.has_attention_mask = value @@ -309,6 +343,11 @@ def reset_activation_shape(self): self.grad_layer = None self.meta_buffer = None + self.pipe_partition_input_meta_cache = None + self.pipe_partition_output_meta_cache = None + self.pipe_partition_grad_meta_cache = None + self.grad_partition_grad_layer_meta_cache = None + def train_batch(self, data_iter=None): """Progress the pipeline to train the next batch of data. The engine will ingest ``self.train_batch_size()`` total samples collectively across all workers. 
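The pipeline-engine change above avoids an int64 all-reduce of the parameter counts by splitting each count into a quotient/remainder pair with a 10**6 chunk, reducing the pair as int32, and recombining afterwards. A small standalone arithmetic sketch of that bookkeeping is below; the parameter count and world size are illustrative.

```python
# Standalone arithmetic sketch of the int32 chunking used for the
# parameter-count all-reduce above; values are illustrative.
chunk_size = 10**6
num_params = 6_738_415_616  # > int32 max, so it cannot be reduced directly as int32

quotient, remainder = divmod(num_params, chunk_size)
assert quotient * chunk_size + remainder == num_params

# After summing over e.g. 64 model-parallel ranks, both fields still fit in int32
# (the comment above bounds this at roughly 2000 ranks, since 2000 * 10**6 < 2**31).
world_size = 64
assert quotient * world_size < 2**31 and remainder * world_size < 2**31

total = (quotient * world_size) * chunk_size + (remainder * world_size)
assert total == num_params * world_size  # reconstructed without any int64 tensors
```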
@@ -393,7 +432,13 @@ def train_batch(self, data_iter=None): # TODO: should return precisely what loss returned and allow others to be queried? return self.agg_train_loss - def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg', bcast_loss=True): + def eval_batch(self, + data_iter, + return_logits=False, + compute_loss=True, + reduce_output='avg', + bcast_loss=True, + eval_micro_batches=None): """Evaluate the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. @@ -443,18 +488,20 @@ def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_o self.set_dataiterator(data_iter) # Do the work - sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, - stages=self.num_stages, - stage_id=self.stage_id) + micro_batches = self.micro_batches if eval_micro_batches is None \ + else eval_micro_batches + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=self.num_stages, stage_id=self.stage_id) # prevent dead-lock with multiple evals sequence - dist.barrier() + if not get_accelerator().device_name() == "hpu": + dist.barrier() with torch.no_grad(): self._exec_schedule(sched) if self.is_last_stage(): - eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output) + eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output, \ + eval_micro_batches=eval_micro_batches) if compute_loss and (bcast_loss or self.monitor.enabled): eval_output = self._bcast_pipe_scalar(eval_output) @@ -496,7 +543,7 @@ def is_last_stage(self): """True if this process is in the last stage in the pipeline.""" return self.stage_id == self.num_stages - 1 - def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True): + def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True, eval_micro_batches=None): if reduce is None: return outputs @@ -511,7 +558,7 @@ def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True): reduced[idx] += out # Average over the microbatches - reduced = self._scale_loss_by_gas(reduced) + reduced = self._scale_loss_by_gas(reduced, eval_micro_batches) # Average over DP groups if reduce_dp and self.is_data_parallel: @@ -641,9 +688,12 @@ def _exec_forward_pass(self, buffer_id): # collect the partitioned input from the previous stage if self.is_pipe_partitioned and not self.is_first_stage(): - part_input = PartitionedTensor.from_meta(meta=inputs[0], + if self.pipe_partition_input_meta_cache is None: + self.pipe_partition_input_meta_cache = inputs[0].to('cpu') + part_input = PartitionedTensor.from_meta(meta=self.pipe_partition_input_meta_cache, local_part=inputs[1], - group=self.grid.get_slice_parallel_group()) + group=self.grid.get_slice_parallel_group(), + device=inputs[0].device) inputs = (part_input.full(), *inputs[2:]) inputs[0].requires_grad = True @@ -675,7 +725,7 @@ def _exec_forward_pass(self, buffer_id): raise ValueError("expecting a tensor or a tuple of tensors") part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group()) # Clear the large output data, but save the computation graph - first_output.data = torch.zeros(1) + first_output.data = torch.zeros(1, device=first_output.data.device) self.pipe_buffers['output_tensors'][buffer_id] = first_output # Inject the partitioned tensor into the output before sending outputs = (part.to_meta(), part.data(), *outputs_tail) @@ -718,6 +768,9 @@ def _exec_backward_pass(self, buffer_id): if self.is_last_stage(): 
super().backward(self.loss) self.mem_status('AFTER BWD') + # TODO: SW-170309 need to remove the below WA once SW-170308 is resolved + if self.sync_micro_batch_step: + _ = self.loss.item() return outputs = self.pipe_buffers['outputs'][buffer_id] @@ -732,9 +785,12 @@ def _exec_backward_pass(self, buffer_id): # careful to also restore the computational graph of the tensors we partitioned. if self.is_pipe_partitioned: if self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta(meta=outputs[0], + if self.pipe_partition_output_meta_cache is None: + self.pipe_partition_output_meta_cache = outputs[0].to('cpu') + part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_output_meta_cache, local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + group=self.grid.get_slice_parallel_group(), + device=outputs[0].device) self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[2:]) else: @@ -745,9 +801,12 @@ def _exec_backward_pass(self, buffer_id): grad_tensors = self.grad_layer if self.is_grad_partitioned: #print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') - part_grad = PartitionedTensor.from_meta(meta=self.grad_layer[0], + if self.grad_partition_grad_layer_meta_cache is None: + self.grad_partition_grad_layer_meta_cache = self.grad_layer[0].to('cpu') + part_grad = PartitionedTensor.from_meta(meta=self.grad_partition_grad_layer_meta_cache, local_part=self.grad_layer[1], - group=self.grid.get_slice_parallel_group()) + group=self.grid.get_slice_parallel_group(), + device=self.grad_layer[0].device) grad_tensors = (part_grad.full(), *grad_tensors[2:]) part_grad = None #print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') @@ -761,12 +820,19 @@ def _exec_backward_pass(self, buffer_id): out_tensors = [t for t in outputs if t.is_floating_point()] assert len(out_tensors) == len(grad_tensors) torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors) + # TODO: SW-170309 need to remove the below WA once SW-170308 is resolved + if self.sync_micro_batch_step: + _ = out_tensors[len(out_tensors) - 1].to('cpu') else: torch.autograd.backward(tensors=(outputs, ), grad_tensors=(grad_tensors, )) + # TODO: SW-170309 need to remove the below WA once SW-170308 is resolved + if self.sync_micro_batch_step: + _ = outputs.to('cpu') if self.using_bf16_optimizer and not self.is_last_stage(): # manually call because we don't call optimizer.backward() - self.optimizer.update_hp_grads(clear_lp_grads=False) + if not self._config.bfloat16_accumulate_grads_via_hooks: + self.optimizer.update_hp_grads(clear_lp_grads=False) # Free up the memory from the output of forward() self.pipe_buffers['output_tensors'][buffer_id] = None @@ -1063,6 +1129,10 @@ def _exec_recv_activations(self, buffer_id): buffer = self.meta_buffer p2p.recv(buffer, self.prev_stage) + + # Performing the clones in a different loop to reduce host dependency, + # and improve performance. 
+ for idx, buffer in enumerate(self.pipe_recv_buf): recvd[idx] = buffer.clone().detach() # NCCL does not like to send torch.BoolTensor types, so un-cast the @@ -1088,9 +1158,12 @@ def _exec_recv_grads(self, buffer_id): # XXX these shapes are hardcoded for Megatron # Restore partitioned output if it was partitioned and we are sending full gradients if self.is_pipe_partitioned and not self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta(meta=outputs[0], + if self.pipe_partition_grad_meta_cache is None: + self.pipe_partition_grad_meta_cache = outputs[0].to('cpu') + part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_grad_meta_cache, local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + group=self.grid.get_slice_parallel_group(), + device=outputs[0].device) outputs[0].data = part_output.full() outputs = (outputs[0], *outputs[2:]) # save for backward diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py index c2db85d1ba58..ea4de4e7c456 100755 --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -9,6 +9,7 @@ import collections import json from abc import ABC, abstractmethod +from deepspeed.accelerator import get_accelerator from deepspeed.utils import logger from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine @@ -51,6 +52,13 @@ def __init__(self, ckpt_list, version, checkpoint_engine): self.module_key = None self.ckpt_list = ckpt_list self.version = version + if get_accelerator().device_name() == 'hpu': + self.map_location = torch.device("cpu") + else: + # From https://pytorch.org/docs/stable/generated/torch.load.html: + # "The storage argument will be the initial deserialization of the storage, residing on the CPU." 
+ # Therefore, we can use storage argument + self.map_location = lambda storage, loc: storage self.checkpoint_engine = TorchCheckpointEngine() if checkpoint_engine is None else checkpoint_engine self.check_ckpt_list() @@ -94,8 +102,7 @@ def load(self, if num_ckpt == mp_world_size: assert os.path.exists(load_path) #logger.info(f'rank: {mp_rank} loading checkpoint: {load_path}') - sd = self.checkpoint_engine.load(load_path, map_location=lambda storage, \ - loc: storage) + sd = self.checkpoint_engine.load(load_path, map_location=self.map_location) if quantize: quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) @@ -120,7 +127,7 @@ def get_merge_state_dicts(self, mp_world_size, mp_rank): ckpt_list = [self.ckpt_list[i] for i in range(num_to_merge * mp_rank, num_to_merge * (mp_rank + 1))] logger.info(f"mp_rank: {mp_rank}, ckpt_list: {ckpt_list}") - sd_list = [self.checkpoint_engine.load(ckpt, map_location=lambda storage, loc: storage) for ckpt in ckpt_list] + sd_list = [self.checkpoint_engine.load(ckpt, map_location=self.map_location) for ckpt in ckpt_list] return sd_list def get_split_state_dict(self, mp_world_size, mp_rank): @@ -133,7 +140,7 @@ def get_split_state_dict(self, mp_world_size, mp_rank): logger.info(f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}") - sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], map_location=lambda storage, loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], map_location=self.map_location) return sd, num_to_split, ckpt_offset @@ -167,7 +174,7 @@ def check_ckpt_list(self): #logger.info(f'checkpoint file list: {self.ckpt_list}') assert len(self.ckpt_list) > 0 - sd = self.checkpoint_engine.load(self.ckpt_list[0], map_location=lambda storage, loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[0], map_location=self.map_location) # check checkpoint count is same with saved mp_world_size if 'mp_world_size' in sd.keys(): @@ -406,7 +413,7 @@ def sanity_check(self, ckpt_file_name): "mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias" ] - sd = self.checkpoint_engine.load(ckpt_file_name, map_location=lambda storage, loc: storage) + sd = self.checkpoint_engine.load(ckpt_file_name, map_location=self.map_location) # partial_key is a sub-string of one key in the sd def check_key_exist(partial_key, sd): diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 48ccdbc29bf6..8f0dc196adc0 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -343,44 +343,53 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): if isinstance(parameters, torch.Tensor): parameters = [parameters] parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for p in parameters: + all_norms.append(p.grad.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + origin_device = total_norm.device.type + total_norm = total_norm.to(get_accelerator().device_name()) # Take max across all GPUs. 
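# --- Editor's illustrative sketch (not part of the patch) ---
# Relating to the state_dict_factory change above: map_location is now chosen once in
# __init__, as a fixed CPU device on HPU and otherwise the usual identity lambda
# (torch.load already hands the lambda a CPU-resident storage). A hedged, standalone
# example; 'ckpt.pt' is a placeholder path:
import torch

def pick_map_location(device_name: str):
    if device_name == 'hpu':
        return torch.device('cpu')           # deserialize straight onto the host
    return lambda storage, loc: storage      # keep the CPU storage torch.load provides

# sd = torch.load('ckpt.pt', map_location=pick_map_location('hpu'))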
if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) else: total_norm = 0 for p in parameters: if mpu is not None: if (mpu.get_model_parallel_rank() == 0) or is_model_parallel_parameter(p): - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item()**norm_type + param_norm = p.grad.data.detach().float().norm(norm_type) + all_norms.append(param_norm) else: - param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type - + param_norm = p.grad.data.detach().float().norm(norm_type) + all_norms.append(param_norm) + if len(all_norms) > 0: + total_norm = torch.stack(all_norms).square().sum().float() + else: + total_norm = torch.FloatTensor([0.0]).to(parameters[0].device) + origin_device = total_norm.device.type + total_norm = total_norm.to(get_accelerator().device_name()) # Sum across all model parallel GPUs. - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) + total_norm = total_norm.pow(1. / norm_type) # Need to average total_norm across different GPUs due to the presence of moe params pg = groups._get_data_parallel_group() scaled_norm = total_norm * 1.0 / float(dist.get_world_size(group=pg)) + scaled_norm_tensor = scaled_norm - scaled_norm_tensor = get_accelerator().FloatTensor([float(scaled_norm)]) dist.all_reduce(scaled_norm_tensor, group=pg) - total_norm = scaled_norm_tensor.item() + total_norm = scaled_norm_tensor + total_norm = total_norm.to(origin_device) + max_norm = torch.tensor([float(max_norm)], device=parameters[0].device) clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) + tmp_tensor = torch.tensor([1.0], device=parameters[0].device) + clip_coef = torch.max(tmp_tensor, clip_coef) + for p in parameters: + p.grad.data.mul_(clip_coef) return total_norm @@ -748,6 +757,8 @@ def memory_status(msg, print_rank=-1, reset_max=False): new_alloced = get_accelerator().memory_allocated() new_cached = get_accelerator().memory_cached() + new_alloced = torch_memory_allocated() # noqa: F821 + delta_alloced = new_alloced - mem_alloced delta_cached = new_cached - mem_cached @@ -875,18 +886,22 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors' norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(t.data.abs().max() for t in input_tensors) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for t in input_tensors: + all_norms.append(t.data.abs().max().float()) + total_norm_cuda = torch.stack(all_norms).max() if mpu is not None: dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() + total_norm = total_norm_cuda.item() else: - total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for t in input_tensors: + all_norms.append(t.data.float().norm(norm_type)) + total_norm_cuda = 
torch.stack(all_norms).pow(norm_type).sum() + if mpu is not None: dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm_cuda.item()**(1. / norm_type) if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 76583c129cb9..ec634aa8ae5e 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -20,6 +20,7 @@ "stage": [0|1|2], "stage3_max_live_parameters" : 1000000000, "stage3_max_reuse_distance" : 1000000000, + "stage3_use_all_reduce_for_fetch_params": [true|false], "allgather_partitions": [true|false], "use_multi_rank_bucket_allreduce": [true|false], "allgather_bucket_size": 500000000, @@ -97,7 +98,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): memory fragmentation during backward pass. """ - reduce_scatter: bool = True + reduce_scatter: bool = False """ Uses reduce or reduce scatter instead of allreduce to average gradients """ @@ -197,7 +198,10 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference. """ - param_persistence_threshold: int = Field(pp_int(1e5), ge=0, alias="stage3_param_persistence_threshold") + #WA for SW-148986. Set param_persistence_threshold to 0 for zero inf + param_persistence_threshold: int = Field( + None, alias="stage3_param_persistence_threshold" + ) # None for dynamic default value (see validator `param_persistence_threshold_valid` below) """ Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially @@ -239,6 +243,12 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): new_param="gather_16bit_weights_on_model_save") """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ + use_all_reduce_for_fetch_params: bool = Field(False, alias="stage3_use_all_reduce_for_fetch_params") + """ + Use all_reduce op when fetching module parameters at stage3. This allows to significantly improve + performance by reducing the overhead of concatenation and slicing on the host. + """ + ignore_unused_parameters: bool = True """ Unused parameters in modules may be unexpected in static networks, but @@ -315,3 +325,14 @@ def offload_ratio_check(cls, values): if offload_config and offload_config.ratio < 1.0: assert values.get("stage") == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3." 
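# --- Editor's illustrative sketch (not part of the patch) ---
# The clip_grad_norm_ / get_global_norm_of_tensors rework above accumulates per-parameter
# norms with torch.stack on the device and all-reduces the resulting tensor, instead of
# calling .item() per parameter (each .item() is a host sync). A hedged sketch of the
# pattern, without the model-parallel handling:
import torch
import torch.distributed as dist

def global_grad_norm(parameters, norm_type: float = 2.0, group=None) -> torch.Tensor:
    norms = [p.grad.detach().float().norm(norm_type) for p in parameters if p.grad is not None]
    if not norms:
        return torch.zeros(1)
    total = torch.stack(norms).pow(norm_type).sum()   # stays a device tensor, no host sync
    if dist.is_initialized():
        dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
    return total.pow(1.0 / norm_type)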
return values + + @validator("param_persistence_threshold") + def param_persistence_threshold_valid(cls, field_value, values): + if field_value is None: + assert ( + "offload_param" + in values), "DeepSpeedZeroConfig: 'offload_param' must be defined before 'param_persistence_threshold'" + field_value = pp_int(1e5) + if values["offload_param"] is not None and values["offload_param"].device != OffloadDeviceEnum.none: + field_value = 0 + return field_value diff --git a/deepspeed/runtime/zero/mics.py b/deepspeed/runtime/zero/mics.py index 9899a070e003..e394610924d3 100755 --- a/deepspeed/runtime/zero/mics.py +++ b/deepspeed/runtime/zero/mics.py @@ -12,8 +12,9 @@ import deepspeed import torch from deepspeed import comm as dist +from deepspeed.runtime.zero.utils import is_zero_param from deepspeed.runtime.zero.mics_utils import (MiCS_CommGroups, create_mics_comm_groups, scale_tensors) -from deepspeed.runtime.zero.parameter_offload import (DeepSpeedZeRoOffload, is_zero_param) +from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload from deepspeed.runtime.zero.partition_parameters import Init, AllGatherCoalescedHandle, ZeroParamStatus from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 from deepspeed.utils import instrument_w_nvtx, log_dist diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index d1ecd9288878..6cc56609340f 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -7,11 +7,11 @@ import torch from collections import OrderedDict from deepspeed.runtime.utils import see_memory_usage +from deepspeed.runtime.zero.utils import apply_to_tensors_only, is_zero_param from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import _init_external_params from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, InflightParamRegistry, iter_params -from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator FWD_MODULE_STACK = list() @@ -40,57 +40,6 @@ def isinstance_namedtuple(obj: object) -> bool: warned = False -def _apply_to_tensors_only(module, functional, backward_function, outputs): - """ - Apply a torch.autograd.Function that calls a `backward_function` to every Tensor in `outputs`. - - Args: - module (torch.nn.Module): A torch module - functional (Type[torch.autograd.Function]): The function class to apply. - backward_function (Callable[[torch.nn.Module], None]): A backward_function to pass to - `functional.apply`. - outputs (Any): The output of `module`. - - Returns: - Any: The output of `module`. - """ - if isinstance(outputs, (tuple, list)): - touched_outputs = [] - for output in outputs: - touched_output = _apply_to_tensors_only(module, functional, backward_function, output) - touched_outputs.append(touched_output) - - if isinstance_namedtuple(outputs): - # namedtuples require a slightly different syntax. 
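# --- Editor's illustrative sketch (not part of the patch) ---
# The zero/config.py additions above expose the all-reduce fetch path through
# "stage3_use_all_reduce_for_fetch_params" and make stage3_param_persistence_threshold
# default dynamically (0 when parameters are offloaded, 1e5 otherwise). A hedged example
# of a ZeRO-3 section using the new key; the other values are arbitrary placeholders:
zero_config_example = {
    "zero_optimization": {
        "stage": 3,
        "stage3_use_all_reduce_for_fetch_params": True,   # new flag added in this patch
        "stage3_max_live_parameters": 1_000_000_000,
        "contiguous_gradients": True,
    }
}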
- return outputs.__class__(*touched_outputs) - - return outputs.__class__(touched_outputs) - elif isinstance(outputs, dict): - # apply inplace to avoid recreating dict inherited objects - for key in outputs.keys(): - outputs[key] = _apply_to_tensors_only(module, functional, backward_function, outputs[key]) - return outputs - - elif isinstance(outputs, torch.Tensor): - # this also applies to torch.Tensor's subclasses like torch.nn.parameter.Parameter - touched_outputs = functional.apply(module, backward_function, outputs) - - # restore zero param attributes if those get stripped by `backward_function` - if not is_zero_param(touched_outputs) and is_zero_param(outputs): - touched_outputs.ds_param_alias = outputs - return touched_outputs - else: - if not is_builtin_type(outputs): - global warned - if not warned and dist.get_rank() == 0: - logger.warning( - f"A module has unknown inputs or outputs type ({type(outputs)}) and the tensors embedded in it cannot be detected. " - "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " - "output tensors and therefore may not get triggered properly.") - warned = True - return outputs - - #for each tensor in outputs run the forward_function and register backward_function as hook def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, outputs): if type(outputs) is tuple: @@ -150,54 +99,6 @@ def _inject_parameters(module, cls): module._parameters = new_param -class PreBackwardFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, module, pre_backward_function, outputs): - ctx.module = module - ctx.pre_backward_function = pre_backward_function - if not hasattr(module, "applied_pre_backward_ref_cnt"): - module.applied_pre_backward_ref_cnt = 0 - module.applied_pre_backward_ref_cnt += 1 - #print(f"After Forward: {ctx.module.__class__.__name__}") - outputs = outputs.detach() - return outputs - - @staticmethod - def backward(ctx, *args): - #print(f"Before Backward: {ctx.module.__class__.__name__}") - ctx.pre_backward_function(ctx.module) - return (None, None) + args - - -class PostBackwardFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, module, post_backward_function, output): - ctx.module = module - if output.requires_grad: - #TODO SOME TIMES post backward does not seem to be triggered debug in detail - #Should only cause increase in memory not correctness issue - #if output.grad_fn.__class__.__name__ == 'ViewBackward': - # ctx.view=True - # print(f"Warning view tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") - #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." 
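# --- Editor's illustrative sketch (not part of the patch) ---
# The _apply_to_tensors_only helper removed here is consolidated into
# deepspeed.runtime.zero.utils as apply_to_tensors_only (imported above), taking the
# autograd function's .apply plus an optional warning callback. At its core it is a
# recursive walk over nested containers; a hedged, simplified sketch of that walk
# (namedtuples and unknown types would need the extra handling the original code has):
import torch

def apply_to_tensors_sketch(fn, value):
    # Recurse through tuples/lists/dicts, apply `fn` to every tensor found,
    # and rebuild containers of the same type around the results.
    if isinstance(value, (tuple, list)):
        return type(value)(apply_to_tensors_sketch(fn, v) for v in value)
    if isinstance(value, dict):
        return {k: apply_to_tensors_sketch(fn, v) for k, v in value.items()}
    if isinstance(value, torch.Tensor):
        return fn(value)
    return value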
- #if module.ds_grads_remaining == 0: - # print(f"Before Forward: {ctx.module.__class__.__name__}") - module.ds_grads_remaining += 1 - ctx.post_backward_function = post_backward_function - output = output.detach() - return output - - @staticmethod - def backward(ctx, *args): - ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 - if ctx.module.ds_grads_remaining == 0: - ctx.post_backward_function(ctx.module) - #print(f"After Backward: {ctx.module.__class__.__name__}") - return (None, None) + args - - class DeepSpeedZeRoOffload(object): def __init__( @@ -393,6 +294,7 @@ def _pre_forward_module_hook(module, *args): @instrument_w_nvtx def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK FWD_MODULE_STACK.pop() if output is None: @@ -433,20 +335,49 @@ def _post_forward_module_hook(module, input, output): self.post_sub_module_forward_function(module) - def _pre_backward_module_hook(module, inputs, output): + def _bwd_hook_unexpected_inputs_msg(value): + return f"A module has unknown inputs or outputs type ({type(value)}) and the tensors embedded in it cannot be detected. " \ + "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " \ + "output tensors and therefore may not get triggered properly." - @instrument_w_nvtx - def _run_before_backward_function(sub_module): - # some models (e.g. Albert) may run multiple forwards on the same layer in a loop - # before doing backwards, so each backward will need a pre-fetch - using reference - # counting to support this scenario - #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") - if sub_module.applied_pre_backward_ref_cnt > 0: - self.pre_sub_module_backward_function(sub_module) - sub_module.applied_pre_backward_ref_cnt -= 1 - #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") + def _pre_backward_module_hook(module, inputs, output): - return _apply_to_tensors_only(module, PreBackwardFunction, _run_before_backward_function, output) + if not hasattr(module, "pre_bwd_fn"): + + @instrument_w_nvtx + def _run_before_backward_function(sub_module): + # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop + # before doing backwards, so each backward will need a pre-fetch - using reference + # counting to support this scenario + #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") + if sub_module.applied_pre_backward_ref_cnt > 0: + self.pre_sub_module_backward_function(sub_module) + sub_module.applied_pre_backward_ref_cnt -= 1 + #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") + + class PreBackwardFunctionForModule(torch.autograd.Function): + + @staticmethod + def forward(ctx, outputs): + # Capture `module` and _run_before_backward_function + ctx.module = module + ctx.pre_backward_function = _run_before_backward_function + if not hasattr(ctx.module, "applied_pre_backward_ref_cnt"): + ctx.module.applied_pre_backward_ref_cnt = 0 + ctx.module.applied_pre_backward_ref_cnt += 1 + outputs = outputs.detach() + return outputs + + @staticmethod + def backward(ctx, *args): + ctx.pre_backward_function(ctx.module) + return args + + module.pre_bwd_fn = PreBackwardFunctionForModule + + return apply_to_tensors_only(module.pre_bwd_fn.apply, + output, + warning_msg_fn=_bwd_hook_unexpected_inputs_msg) #This is an alternate to doing _post_backward_module_hook #it uses tensor.register_hook instead of using torch.autograd.Function @@ -471,12 +402,44 @@ def _run_before_forward_function(input): def _post_backward_module_hook(module, inputs): module.ds_grads_remaining = 0 - @instrument_w_nvtx - def _run_after_backward_function(sub_module): - if sub_module.ds_grads_remaining == 0: - self.post_sub_module_backward_function(sub_module) - - return _apply_to_tensors_only(module, PostBackwardFunction, _run_after_backward_function, inputs) + if not hasattr(module, "post_bwd_fn"): + + @instrument_w_nvtx + def _run_after_backward_function(sub_module): + if sub_module.ds_grads_remaining == 0: + self.post_sub_module_backward_function(sub_module) + + class PostBackwardFunctionModule(torch.autograd.Function): + + @staticmethod + def forward(ctx, output): + ctx.module = module + if output.requires_grad: + #TODO SOME TIMES post backward does not seem to be triggered debug in detail + #Should only cause increase in memory not correctness issue + #if output.grad_fn.__class__.__name__ == 'ViewBackward': + # ctx.view=True + # print(f"Warning view tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") + #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." 
+ #if module.ds_grads_remaining == 0: + # print(f"Before Forward: {ctx.module.__class__.__name__}") + module.ds_grads_remaining += 1 + ctx.post_backward_function = _run_after_backward_function + output = output.detach() + return output + + @staticmethod + def backward(ctx, *args): + ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 + if ctx.module.ds_grads_remaining == 0: + ctx.post_backward_function(ctx.module) + return args + + module.post_bwd_fn = PostBackwardFunctionModule + + return apply_to_tensors_only(module.post_bwd_fn.apply, + inputs, + warning_msg_fn=_bwd_hook_unexpected_inputs_msg) # Pre forward hook self.forward_hooks.append(module.register_forward_pre_hook(_pre_forward_module_hook)) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index cdf7de512b9b..074a70b27840 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -23,9 +23,9 @@ from deepspeed.utils import groups import deepspeed -from ..utils import see_memory_usage +from ..utils import get_only_unique_item, see_memory_usage from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks +from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks, is_zero_param from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.config_utils import get_config_default from deepspeed.utils import instrument_w_nvtx, logger @@ -109,12 +109,6 @@ def debug_rank0(msg: str) -> None: logger.debug(msg) -def is_zero_param(parameter): - if not torch.is_tensor(parameter): - return False - return hasattr(parameter, 'ds_id') - - def _init_external_params(module): if not hasattr(module, '_external_params'): module._external_params = {} @@ -243,7 +237,7 @@ def wrapped_fn(*args, **kwargs) -> Tensor: kwargs['device'] = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor: Tensor = fn(*args, **kwargs) if tensor.is_floating_point(): - tensor.data = tensor.data.to(target_fp_dtype) + tensor = tensor.to(target_fp_dtype) return tensor @@ -287,6 +281,9 @@ def free_param(param: Parameter) -> None: # need to make sure that we don't free the parameter while it is still # being used for computation if not get_accelerator().is_synchronized_device(): + # TODO SW-163871: remove the below WA once SW-154947 is resolved, solves OOM. 
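# --- Editor's illustrative sketch (not part of the patch) ---
# In the parameter_offload.py hook rework above, the module-agnostic
# PreBackwardFunction/PostBackwardFunction (which received the module as a forward
# argument) are replaced by autograd Functions defined per module, capturing `module`
# and the callback via closure so forward() only takes tensors. A hedged, minimal
# version of that capture pattern:
import torch

def make_pre_backward_fn(module, callback):
    class PreBackwardSketch(torch.autograd.Function):

        @staticmethod
        def forward(ctx, output):
            # `module` and `callback` come from the enclosing scope, not from the inputs
            return output.detach()

        @staticmethod
        def backward(ctx, *grads):
            callback(module)   # runs just before the module's backward pass
            return grads

    return PreBackwardSketch

# usage, mirroring the hooks above:
#   module.pre_bwd_fn = make_pre_backward_fn(module, pre_hook)
#   output = module.pre_bwd_fn.apply(output)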
+ if get_accelerator().device_name() == "hpu" and get_accelerator().is_zero3_sync_mark_step_req(): + get_accelerator().synchronize() param.data.record_stream(get_accelerator().current_stream()) # param.data doesn't store anything meaningful in partitioned state param.data = torch.empty(0, dtype=param.dtype, device=param.device) @@ -667,6 +664,31 @@ def wait(self) -> None: handle.wait() +class AllReduceCoalescedHandle: + + def __init__(self, handle, params: List[Parameter]) -> None: + self.handle = handle + self.params = params + self.complete = False + + for param in self.params: + if param.ds_status != ZeroParamStatus.INFLIGHT: + raise RuntimeError(f"expected param {param.ds_summary()} to not be available") + + @instrument_w_nvtx + def wait(self) -> None: + if self.complete: + return + + instrument_w_nvtx(self.handle.wait)() + + for param in self.params: + assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" + param.ds_status = ZeroParamStatus.AVAILABLE + + self.complete = True + + class QuantizationInfo: # a placeholder object to store all quant related vars used in handles def __init__(self) -> None: @@ -859,7 +881,16 @@ def __init__( _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path, mpu) if config_dict_or_path is not None else None if _ds_config is not None: - mem_efficient_linear = _ds_config.zero_config.memory_efficient_linear + if _ds_config.zero_config.memory_efficient_linear and _ds_config.compile_config.enabled: + # memory_efficient_linear displays numerous errors when torch.compile is enabled. + # Refer to https://github.com/pytorch/pytorch/issues/119059 for details. + # Further investigation into performance is necessary, even after resolving this issue because + # the `memory_efficient_linear` module may lead to more graph breaks compared to the original implementation. + logger.warning(f'memory_efficient_linear is disabled when torch.compile is enabled.') + mem_efficient_linear = False + else: + mem_efficient_linear = _ds_config.zero_config.memory_efficient_linear + super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear, ds_config=_ds_config, dtype=dtype) if not dist.is_initialized(): init_distributed() @@ -900,8 +931,12 @@ def __init__( # Local device is the device where the parameters are consumed, must be default device. # It is the device where parameters are fully instantiated using allgather - self.local_device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) - get_accelerator().set_device(self.local_device) + # todo SW-143933 remove the below HPU wa for torch.device initialization, ticket SW-143931 has to be solved. 
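# --- Editor's illustrative sketch (not part of the patch) ---
# The use_all_reduce_for_fetch_params path added a bit further below gathers module
# parameters by zero-filling one flat buffer, letting each rank copy its shard into its
# own offset, and summing with a single async all_reduce (wrapped by the new
# AllReduceCoalescedHandle above). A hedged sketch of the buffer layout; names are
# placeholders, and each full size is assumed to be world_size * shard.numel():
import torch
import torch.distributed as dist

def fetch_via_all_reduce(shards, full_numels, rank, group=None):
    flat = torch.zeros(sum(full_numels), dtype=shards[0].dtype, device=shards[0].device)
    offset = 0
    for shard, full_numel in zip(shards, full_numels):
        # only this rank's slot is written; everything else stays zero, so the SUM
        # all_reduce reconstructs every parameter in place
        flat.narrow(0, offset + rank * shard.numel(), shard.numel()).copy_(shard)
        offset += full_numel
    handle = dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=group, async_op=True)
    return flat, handle   # call handle.wait() before viewing parameters out of `flat`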
+ if get_accelerator().device_name() == "hpu": + self.local_device = torch.device("hpu") + else: + self.local_device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) + get_accelerator().set_device(self.local_device) self.quantized_weights = zero_quantized_weights if _ds_config is not None and _ds_config.zero_config.zero_quantized_weights and not self.quantized_weights: @@ -945,6 +980,11 @@ def __init__( if not self.use_all_gather_into_tensor: logger.info(f"all_gather_into_tensor API is not available in torch {torch.__version__}") + self.use_all_reduce_for_fetch_params = get_config_default(DeepSpeedZeroConfig, + "use_all_reduce_for_fetch_params") + if _ds_config is not None: + self.use_all_reduce_for_fetch_params = _ds_config.zero_config.use_all_reduce_for_fetch_params + def _update_persist_config(self, ds_config): Init.apply_param_persistence = True Init.param_persistence_threshold = ds_config.zero_config.param_persistence_threshold @@ -1118,7 +1158,7 @@ def all_gather_coalesced(params: Iterable[Parameter], ds_process_group = self.ds_process_group rank_in_group = self.rank world_size = self.dp_world_size - use_secondary_tensor = False + use_secondary_tensor = params[0].ds_secondary_tensor is not None and not forward if self.zero_param_process_group and not forward: ds_process_group = self.zero_param_process_group #intragroup rank_in_group = self.rank_in_group @@ -1149,10 +1189,10 @@ def all_gather_coalesced(params: Iterable[Parameter], # have an opportunity to avoid some intermediate memory allocations param, = params buffer_size = math.ceil(param.ds_numel / world_size) * world_size - if not forward and param.ds_secondary_tensor is not None: + if use_secondary_tensor: buffer_size = param.ds_secondary_tensor.shape[0] * world_size #make sure out is appropriately sized - param_ds_tensor = param.ds_secondary_tensor if not forward and param.ds_secondary_tensor is not None else param.ds_tensor + param_ds_tensor = param.ds_secondary_tensor if use_secondary_tensor else param.ds_tensor param_buffer = torch.empty( buffer_size, dtype=param_ds_tensor.dtype if not quantize else torch.int8, @@ -1193,78 +1233,101 @@ def all_gather_coalesced(params: Iterable[Parameter], return AllGatherHandle(handle, param, quantization=quant_info) else: - if not quantize: - dtype_params = defaultdict(list) - for p in params: - dtype_params[p.ds_tensor.dtype].append(p) - handles = [] - for dtype, params in dtype_params.items(): - handles.append( - _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_process_group)) + # TODO (SW-164792): Handle use case when use_secondary_tensor is True + if self.use_all_reduce_for_fetch_params and not quantize and not use_secondary_tensor: + # Use all_reduce instead of all_gather to fetch the module params + flat_buffer_size = sum(p.ds_numel_aligned for p in params) + flat_tensor = torch.zeros(flat_buffer_size, + dtype=get_only_unique_item(p.ds_tensor.dtype for p in params), + device=get_accelerator().current_device_name(), + requires_grad=False) + start_param = 0 + for param in params: + param.data = flat_tensor.narrow(0, start_param, param.ds_numel).view(param.ds_shape) + start = start_param + param.ds_tensor.ds_numel * self.get_partition_rank() + flat_tensor.narrow(0, start, param.ds_tensor.ds_numel).copy_(param.ds_tensor) - return MultipleAllGatherHandles(handles) + start_param += param.ds_numel - else: - partition_sz = sum(p.ds_tensor.ds_numel for p in params) + handle = dist.all_reduce(flat_tensor, group=ds_process_group, async_op=True) - if 
params[0].ds_secondary_tensor is not None and not forward: - partition_sz = sum(p.ds_tensor.ds_numel * p.ds_secondary_tensor_num_of_groups for p in params) + return AllReduceCoalescedHandle(handle=handle, params=params) + else: + if not quantize: + dtype_params = defaultdict(list) + for p in params: + dtype_params[p.ds_tensor.dtype].append(p) + handles = [] + for dtype, params in dtype_params.items(): + handles.append( + _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_process_group)) - flat_tensor = torch.empty(partition_sz * world_size, - dtype=torch.int8, - device=get_accelerator().current_device_name(), - requires_grad=False) + return MultipleAllGatherHandles(handles) - if params[0].ds_secondary_tensor is not None and not forward: - use_secondary_tensor = True - if hasattr(params[0].ds_secondary_tensor, "ds_quant_scale"): - quantized_param = instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.data.to(get_accelerator().current_device_name()) for p in params - ]) - scales = instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) - for p in params - ]) - else: - quantized_param, scales = self.quantizer_module.quantize( - instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.to(get_accelerator().current_device_name()) for p in params - ])) else: - if hasattr(params[0].ds_tensor, "ds_quant_scale"): - quantized_param = instrument_w_nvtx(torch.cat)( - [p.ds_tensor.data.to(get_accelerator().current_device_name()) for p in params]) - scales = instrument_w_nvtx(torch.cat)([ - p.ds_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) for p in params - ]) + partition_sz = sum(p.ds_tensor.ds_numel for p in params) + + if use_secondary_tensor: + partition_sz = sum(p.ds_tensor.ds_numel * p.ds_secondary_tensor_num_of_groups + for p in params) + + flat_tensor = torch.empty(partition_sz * world_size, + dtype=torch.int8, + device=get_accelerator().current_device_name(), + requires_grad=False) + + if use_secondary_tensor: + if hasattr(params[0].ds_secondary_tensor, "ds_quant_scale"): + quantized_param = instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.data.to(get_accelerator().current_device_name()) + for p in params + ]) + scales = instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) + for p in params + ]) + else: + quantized_param, scales = self.quantizer_module.quantize( + instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.to(get_accelerator().current_device_name()) + for p in params + ])) else: - quantized_param, scales = self.quantizer_module.quantize( - instrument_w_nvtx(torch.cat)( - [p.ds_tensor.to(get_accelerator().current_device_name()) for p in params])) - quant_scale_buffer = torch.empty( - scales.numel() * world_size, - dtype=torch.float32, - device=get_accelerator().current_device_name(), - requires_grad=False, - ) - handle = _dist_allgather_fn(quantized_param, flat_tensor, ds_process_group) - quant_handle = _dist_allgather_fn(scales, quant_scale_buffer, ds_process_group) - quant_info = QuantizationInfo() - quant_info.quantized_param = flat_tensor - quant_info.backend = self.quantizer_module - quant_info.quant_handle = quant_handle - quant_info.scale_buffer = quant_scale_buffer - quant_info.partition_sz = partition_sz - quant_info.world_size = world_size - return AllGatherCoalescedHandle( - allgather_handle=handle, - params=params, - partitions=None, - world_size=world_size, - use_secondary_tensor=use_secondary_tensor, - 
forward=forward, - quantization=quant_info, - ) + if hasattr(params[0].ds_tensor, "ds_quant_scale"): + quantized_param = instrument_w_nvtx(torch.cat)( + [p.ds_tensor.data.to(get_accelerator().current_device_name()) for p in params]) + scales = instrument_w_nvtx(torch.cat)([ + p.ds_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) + for p in params + ]) + else: + quantized_param, scales = self.quantizer_module.quantize( + instrument_w_nvtx(torch.cat)( + [p.ds_tensor.to(get_accelerator().current_device_name()) for p in params])) + quant_scale_buffer = torch.empty( + scales.numel() * world_size, + dtype=torch.float32, + device=get_accelerator().current_device_name(), + requires_grad=False, + ) + handle = _dist_allgather_fn(quantized_param, flat_tensor, ds_process_group) + quant_handle = _dist_allgather_fn(scales, quant_scale_buffer, ds_process_group) + quant_info = QuantizationInfo() + quant_info.quantized_param = flat_tensor + quant_info.backend = self.quantizer_module + quant_info.quant_handle = quant_handle + quant_info.scale_buffer = quant_scale_buffer + quant_info.partition_sz = partition_sz + quant_info.world_size = world_size + return AllGatherCoalescedHandle( + allgather_handle=handle, + params=params, + partitions=None, + world_size=world_size, + use_secondary_tensor=use_secondary_tensor, + forward=forward, + quantization=quant_info, + ) def partition(param_list=None, backward=False, hierarchy=0, has_been_updated=False): cls = param @@ -1500,6 +1563,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): param.ds_tensor.ds_numel = partition_size param.ds_tensor.status = PartitionedParamStatus.AVAILABLE param.ds_tensor.final_location = final_location + param.ds_numel_aligned = tensor_size start = partition_size * self.get_partition_rank() end = start + partition_size @@ -1633,7 +1697,8 @@ def _allgather_param(self, param, async_op=False, hierarchy=0): f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ', force=False) - get_accelerator().synchronize() + if not get_accelerator().device_name() == "hpu": + get_accelerator().synchronize() print_rank_0( f"{'--'* hierarchy}----allgather param with {debug_param2name_id_shape_status(param)} partition size={partition_size}" @@ -1766,7 +1831,8 @@ def _allgather_params_coalesced(self, param_list, hierarchy=0, quantize=False): param.data = gathered_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape).data # guarantee the communication to be completed - get_accelerator().synchronize() + if not get_accelerator().device_name() == "hpu": + get_accelerator().synchronize() return None diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index 9bcf5a91bc95..5d119c95c682 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -16,6 +16,8 @@ from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id from deepspeed.accelerator import get_accelerator +import deepspeed.runtime.compiler as compiler + import logging ENABLE_PROFILER = False @@ -122,7 +124,7 @@ def __init__( # mechanism which doesn't require any configuration by the user. self.__ongoing_fetch_events: Deque[get_accelerator().Event] = collections.deque() # TODO. 
make this configurable via JSON - self.__max_ongoing_fetch_events: int = 2 + self.__max_ongoing_fetch_events: int = 2 if get_accelerator().device_name() != "hpu" else -1 self.__profiler = PartitionedParameterProfiler(timers if ENABLE_PROFILER else None) """Tracing and Tracking @@ -174,6 +176,7 @@ def trace_prologue(self, sub_module: Module) -> None: force=True) self._invalidate_trace() + @compiler.disable def record_module(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): @@ -251,6 +254,7 @@ def _dump_param_ids(self, tag, mod_id, p_ids, step_id=None): Fetching, prefetching, and releasing parameters """ + @compiler.disable @instrument_w_nvtx @torch.no_grad() def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: @@ -271,6 +275,7 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: params_to_fetch = frozenset(iter_params(current_submodule)) fetch_numel = sum( [p.partition_numel() for p in params_to_fetch if p.ds_status == ZeroParamStatus.NOT_AVAILABLE]) + if fetch_numel > 0: event_name = __class__.FORWARD_FETCH_SUBMIT if forward else __class__.BACKWARD_FETCH_SUBMIT self._dump_param_ids(event_name, current_submodule.id, @@ -297,18 +302,18 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: with get_accelerator().stream(self.__allgather_stream): while self.__ongoing_fetch_events and self.__ongoing_fetch_events[0].query(): self.__ongoing_fetch_events.popleft() - if len(self.__ongoing_fetch_events) > self.__max_ongoing_fetch_events: + if len(self.__ongoing_fetch_events) > self.__max_ongoing_fetch_events > -1: self.__ongoing_fetch_events.popleft().synchronize() self.__inflight_param_registry.pop(param).wait() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().handles_memory_backpressure(): event = get_accelerator().Event() event.record() self.__ongoing_fetch_events.append(event) assert param.ds_status == ZeroParamStatus.AVAILABLE, param.ds_summary() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): get_accelerator().current_stream().wait_stream(self.__allgather_stream) self.__profiler.stop_event(wait_event_name, wait_numel) @@ -438,7 +443,6 @@ def __all_gather_params_(self, params: Set[Parameter], forward: bool, quantize: all_gather_numel += param.ds_numel if partitioned_params: - partitioned_params self.__n_available_params += all_gather_numel with get_accelerator().stream(self.__allgather_stream): event_name = __class__.FORWARD_ALL_GATHER if forward else __class__.BACKWARD_ALL_GATHER @@ -459,6 +463,7 @@ def __all_gather_params_(self, params: Set[Parameter], forward: bool, quantize: if swap_persisted_params: swap_persisted_params[0].nvme_swapper.remove_partition_and_release_buffers(swap_persisted_params) + @compiler.disable @instrument_w_nvtx def __release_param(self, param: Parameter, backward: bool) -> None: if param.ds_status == ZeroParamStatus.AVAILABLE and not param.ds_active_sub_modules: diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 63aa4ca4c48e..961d90ec3b29 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -15,18 +15,17 @@ from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, get_global_norm, 
is_model_parallel_parameter, get_only_unique_item +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload -from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE, LOSS_SCALER from deepspeed.accelerator import get_accelerator - # Toggle this to true to enable correctness test # with gradient partitioning and without pg_correctness_test = False @@ -195,16 +194,16 @@ def __init__( # backup fused_adam optimizer init if self.offload_optimizer and self.partial_offload != 1.0: - backup_gpu_tensor = torch.randn(1, device='cuda').to(self.dtype) + backup_gpu_tensor = torch.randn(1, device=get_accelerator().device_name()).to(self.dtype) backup_gpu_param = torch.nn.Parameter(backup_gpu_tensor) assert type(init_optimizer) == DeepSpeedCPUAdam, 'Hybrid Optimizer Only Supports DeepSpeedCPUAdam' - self.backup_optimizer = FusedAdam([backup_gpu_param], - lr=self.optimizer.param_groups[0]["lr"], - bias_correction=self.optimizer.param_groups[0]["bias_correction"], - betas=self.optimizer.param_groups[0]["betas"], - eps=self.optimizer.param_groups[0]["eps"], - weight_decay=self.optimizer.param_groups[0]["weight_decay"], - amsgrad=self.optimizer.param_groups[0]["amsgrad"]) + self.backup_optimizer = torch.optim.AdamW([backup_gpu_param], + lr=self.optimizer.param_groups[0]["lr"], + betas=self.optimizer.param_groups[0]["betas"], + eps=self.optimizer.param_groups[0]["eps"], + weight_decay=self.optimizer.param_groups[0]["weight_decay"], + amsgrad=self.optimizer.param_groups[0]["amsgrad"]) + # Multiple param_groups configs for back-up optimizer if len(self.optimizer.param_groups) > 1: for i in range(1, len(self.optimizer.param_groups)): @@ -323,7 +322,7 @@ def __init__( self.param_reduce_events: Deque[get_accelerator().Event] = collections.deque() # TODO. 
make this configurable via JSON - self.max_param_reduce_events: int = 2 + self.max_param_reduce_events: int = 2 if get_accelerator().device_name() != "hpu" else -1 self.param_dict = {} @@ -461,16 +460,13 @@ def invalidate_secondary_tensor(self): param.ds_secondary_tensor = None def _setup_for_real_optimizer(self): - see_memory_usage("Before creating fp32 partitions", force=True) self._create_fp32_partitions() - see_memory_usage("After creating fp32 partitions", force=True) dist.barrier() # To support pipelined optimizer swapping self._create_next_swappable_fp32_groups() see_memory_usage("Before initializing optimizer states", force=True) - self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states", force=True) dist.barrier() @@ -541,6 +537,7 @@ def defragment(tensors: List[Tensor]) -> Tensor: offset += tensor_numel gc.collect() + #TODO SW-107191: support empty_cache() in hpu get_accelerator().empty_cache() # copy tensors (now flattened and contiguous) back to GPU @@ -1075,7 +1072,7 @@ def independent_gradient_partition_epilogue(self): self.__reduce_and_partition_ipg_grads() self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() #in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad @@ -1158,7 +1155,7 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param): @instrument_w_nvtx @torch.no_grad() def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.wait_stream(get_accelerator().default_stream()) if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() <= self.reduce_bucket_size: @@ -1189,7 +1186,7 @@ def __reduce_and_partition_ipg_grads(self, safe_mode: bool = False) -> None: while self.param_reduce_events and self.param_reduce_events[0].query(): self.param_reduce_events.popleft() - if len(self.param_reduce_events) > self.max_param_reduce_events: + if len(self.param_reduce_events) > self.max_param_reduce_events > -1: self.param_reduce_events.popleft().synchronize() with get_accelerator().stream(self.reduce_and_partition_stream): @@ -1207,15 +1204,14 @@ def __reduce_and_partition_ipg_grads(self, safe_mode: bool = False) -> None: self.params_in_ipg_bucket.clear() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().handles_memory_backpressure(): event = get_accelerator().Event() event.record() self.param_reduce_events.append(event) @instrument_w_nvtx def __avg_scatter_contiguous_grads(self, buffer_to_reduce: Tensor) -> List[Tensor]: - dtype = buffer_to_reduce.dtype - if self.communication_data_type == self.dtype: + if self.communication_data_type != self.dtype: buffer_to_reduce = buffer_to_reduce.to(self.communication_data_type) if self.postscale_gradients and self.gradient_predivide_factor != 1.0: buffer_to_reduce = buffer_to_reduce.div_(self.gradient_predivide_factor) @@ -1324,7 +1320,7 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): param_id = self.get_param_id(p) if param_id in self.norm_for_param_grads.keys(): param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item()**2 + total_norm += param_norm**2 # Sum across all model parallel GPUs. 
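# --- Editor's illustrative sketch (not part of the patch) ---
# Both event deques above (__max_ongoing_fetch_events and max_param_reduce_events) now
# use -1 on HPU to mean "never block", relying on the chained comparison
# `len(events) > limit > -1` so the synchronize branch is skipped entirely. A hedged
# sketch of that throttling pattern with a generic accelerator-event object:
import collections

def throttle_inflight(events: collections.deque, limit: int, new_event) -> None:
    # drop events that have already completed
    while events and events[0].query():
        events.popleft()
    # only enforce the cap when limit is a real bound (i.e. not -1)
    if len(events) > limit > -1:
        events.popleft().synchronize()   # block until the oldest outstanding event finishes
    events.append(new_event)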
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) @@ -1333,12 +1329,16 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm_cuda[0]**(1. / norm_type) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) - return total_norm + err = torch.tensor(-1.0, device=inf_or_nan.device, dtype=torch.float) + total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm + + return total_norm.cpu() @instrument_w_nvtx def partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: @@ -1665,7 +1665,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() + total_norm = total_norm_cuda[0] else: # if dist.get_rank() == 0: # logger.info(f"Total Norm beginning {total_norm}") @@ -1686,10 +1686,14 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda.item()**(1. / norm_type) + total_norm = total_norm_cuda**(1. / norm_type) + + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + err = torch.tensor(-1.0, device=inf_or_nan.device, dtype=torch.float) + total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm return total_norm @@ -1949,7 +1953,7 @@ def step(self, closure=None): return norm_groups = self._get_norm_groups() - scaled_global_grad_norm = get_global_norm(norm_list=norm_groups) + scaled_global_grad_norm = torch.norm(torch.stack(norm_groups)) # Stash unscaled gradient norm self._global_grad_norm = scaled_global_grad_norm / self.loss_scale @@ -2072,7 +2076,7 @@ def has_overflow(self, partition_gradients=True): overflow_gpu = self.inf_or_nan_tracker.clone().to(torch.uint8) self.inf_or_nan_tracker.zero_() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): get_accelerator().default_stream().wait_stream(self.reduce_and_partition_stream) dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) @@ -2143,7 +2147,7 @@ def get_fp32_grad_partitions(self) -> Dict[int, Dict[int, Tensor]]: """get fp32 gradient partition dictionary accessed as grad_dict[parameter_group_index][parameter_index] """ - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() grad_dict = collections.defaultdict(dict) if self.offload_optimizer: @@ -2173,7 +2177,7 @@ def get_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() if self.offload_optimizer: @@ -2185,7 +2189,7 @@ def get_fp32_grad_for_param(self, param) -> Tensor: return self._fp32_state_allgather(param, fp32_grad) def 
_get_fp32_opt_state_partition(self, param, optim_state_key=None): - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] @@ -2242,7 +2246,7 @@ def get_local_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() if self.offload_optimizer: diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 1d2d561dbd39..a55a12e938d5 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -12,9 +12,8 @@ from deepspeed.runtime import ZeROOptimizer from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, - inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) - +from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, empty_cache, see_memory_usage, inf, + is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -75,11 +74,6 @@ def get_alignment_padding(tensor_list, alignment): return (alignment - remainder) if remainder else remainder -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - def print_rank_msg(msg): print(f"rank {dist.get_rank()} - {msg}") @@ -237,7 +231,7 @@ def __init__(self, f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \ f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam." - if self.reduce_scatter: + if self.reduce_scatter and self.partition_gradients: valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32) assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. 
Got: '{self.communication_data_type}'" assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled" @@ -273,7 +267,7 @@ def __init__(self, self.partition_size = [] # align nccl all-gather send buffers to 4-byte boundary - self.nccl_start_alignment_factor = 2 # 4-byte alignment/sizeof(fp16) = 2 + self.nccl_start_alignment_factor = int(4 / self.optimizer.param_groups[0]['params'][0].element_size()) assert ( allgather_bucket_size % self.nccl_start_alignment_factor == 0 @@ -294,6 +288,7 @@ def __init__(self, self.round_robin_bit16_groups = [] self.round_robin_bit16_indices = [] + self.round_robin_bit16_meta = [] # Use different parallel to do all_to_all_reduce related things # padding on each partition for alignment purposes @@ -316,7 +311,14 @@ def __init__(self, see_memory_usage(f"Before moving param group {i} to CPU") # move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.bit16_groups[i]) + + # Create temp CPU param copies, free accelerator tensors + orig_group_numel = 0 + for param in self.bit16_groups[i]: + orig_group_numel += param.numel() + param.cpu_data = param.data.cpu() + param.data = torch.empty(1).to(param.device) + empty_cache() see_memory_usage(f"After moving param group {i} to CPU", force=False) @@ -334,18 +336,31 @@ def __init__(self, self.round_robin_bit16_groups.append(round_robin_tensors) self.round_robin_bit16_indices.append(round_robin_indices) - # create flat buffer in CPU and move to GPU - self.bit16_groups_flat.append( - self.flatten_dense_tensors_aligned( - self.round_robin_bit16_groups[i], - self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])).to( - get_accelerator().current_device_name())) + # Create meta tensors list, ordered according to round_robin_tensors + meta_tensors = [] + for param in round_robin_tensors: + meta_tensors.append(torch.zeros_like(param.cpu_data, device="meta")) + self.round_robin_bit16_meta.append(meta_tensors) + + # create flat buffer in CPU + flattened_buffer = self.flatten_dense_tensors_aligned( + self.round_robin_bit16_groups[i], + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i]), + use_cpu_data=True) + + # free temp CPU params + for param in self.bit16_groups[i]: + del param.cpu_data + + # Move CPU flat tensor to the accelerator memory. 
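# --- Editor's illustrative sketch (not part of the patch) ---
# The stage 1/2 changes above avoid keeping full CPU copies of every bit16 parameter:
# temporary param.cpu_data copies are flattened on the host and freed, while lightweight
# 'meta' tensors (round_robin_bit16_meta) are kept purely as shape templates for later
# unflattening. A hedged sketch of the meta-template idea using torch's flatten helpers:
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

params = [torch.randn(4, 4), torch.randn(8)]                            # placeholder parameters
meta_templates = [torch.zeros_like(p, device="meta") for p in params]   # shapes only, no storage

flat = _flatten_dense_tensors(params)                    # one contiguous buffer
views = _unflatten_dense_tensors(flat, meta_templates)   # templates supply numel/shape
for p, view in zip(params, views):
    p.data = view.data                                   # re-point params at the flat buffer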
+ self.bit16_groups_flat.append(flattened_buffer.to(get_accelerator().current_device_name())) + del flattened_buffer + see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) # Record padding required for alignment if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: - padding = self.bit16_groups_flat[i].numel() - sum( - [t.numel() for t in self.round_robin_bit16_groups[i]]) + padding = self.bit16_groups_flat[i].numel() - orig_group_numel else: padding = 0 self.groups_padding.append(padding) @@ -590,8 +605,7 @@ def _configure_moe_settings(self): assert self.ep_process_group is not None, "Expert parallel group should be configured with MoE" def _update_model_bit16_weights(self, group_index): - updated_params = self.unflatten(self.bit16_groups_flat[group_index], - self.round_robin_bit16_groups[group_index]) + updated_params = self.unflatten(self.bit16_groups_flat[group_index], self.round_robin_bit16_meta[group_index]) for p, q in zip(self.round_robin_bit16_groups[group_index], updated_params): p.data = q.data @@ -723,7 +737,8 @@ def independent_gradient_partition_epilogue(self): self.params_already_reduced[i] = False if self.overlap_comm: - get_accelerator().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().synchronize() # It is safe to clear previously reduced grads of other partitions self._clear_previous_reduced_grads() @@ -881,7 +896,8 @@ def report_ipg_memory_usage(self, tag, param_elems): ) # create a flat tensor aligned at the alignment boundary - def flatten_dense_tensors_aligned(self, tensor_list, alignment): + def flatten_dense_tensors_aligned(self, tensor_list, alignment, use_cpu_data=False): + tensor_list = [param.cpu_data for param in tensor_list] if use_cpu_data else tensor_list return self.flatten(align_dense_tensors(tensor_list, alignment)) ############### Independent Partition Gradient ######################## @@ -998,7 +1014,7 @@ def allreduce_and_scatter(self, bucket, numel_per_bucket=500000000, log=None, di def average_tensor(self, tensor): if self.overlap_comm: stream = self.reduction_stream - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): stream.wait_stream(get_accelerator().current_stream()) else: stream = get_accelerator().current_stream() @@ -1127,7 +1143,7 @@ def get_grad_position(self, group_id, tensor_list, first_offset, partition_size) def update_overflow_tracker_for_param_grad(self, param): grad_accum = self.get_param_gradient_attribute(param) - if grad_accum is not None and self._has_inf_or_nan(grad_accum.data): + if grad_accum is not None and self._has_inf_or_nan(grad_accum.data, cpu_offload=self.cpu_offload): self.local_overflow = True def _get_offload_gradient_dict(self): @@ -1325,7 +1341,7 @@ def reduce_ipg_grads(self): self.average_tensor(extra_large_grad_reduc.view(-1)) self.extra_large_param_to_reduce = None else: - self.average_tensor(self.ipg_buffer[self.ipg_index]) + self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket)) else: self.buffered_reduce_fallback(None, self.grads_in_ipg_bucket, @@ -1480,7 +1496,8 @@ def _clear_previous_reduced_grads(self): def allreduce_and_copy(self, small_bucket, rank=None, log=None, divide=True, process_group=None): process_group = self.dp_process_group if process_group is None else process_group if self.overlap_comm: - get_accelerator().synchronize() + if not get_accelerator().resolves_data_dependency(): + 
get_accelerator().synchronize() # It is safe to clear the previously reduced grads of other partitions self._clear_previous_reduced_grads() stream = self.reduction_stream @@ -1629,16 +1646,16 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): Total norm of the parameters (viewed as a single vector). """ norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) + for g in gradients: + all_norms.append(g.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() + self._model_parallel_all_reduce(tensor=total_norm, op=dist.ReduceOp.MAX) else: - total_norm = 0.0 # if dist.get_rank() == 0: # logger.info(f"Total Norm beginning {total_norm}") for g, p in zip(gradients, params): @@ -1646,18 +1663,25 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: continue if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item()**2 - # Sum across all model parallel GPUs. - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) + all_norms.append( + torch.norm(g.data.double().detach(), norm_type).to(get_accelerator().current_device_name())) + if len(all_norms) > 0: + total_norm = torch.stack(all_norms).square().sum().float() + else: + total_norm = torch.tensor(0.0, dtype=torch.float32).to(self.device) + # Sum across all model parallel Device. + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm.pow(1. 
/ norm_type) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) + + err = torch.tensor(-1.0, device=self.device, dtype=torch.float) + total_norm = err.where(inf_or_nan, total_norm) return total_norm @@ -1734,7 +1758,9 @@ def scaled_global_norm(self, norm_type=2): for i, group in enumerate(self.bit16_groups): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: - norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i])) + norm = torch.tensor(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i]), + device=self.device) + norm_groups.append(norm) single_grad_partition = self.single_partition_of_fp32_groups[i].grad else: norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) @@ -1743,7 +1769,7 @@ def scaled_global_norm(self, norm_type=2): self._average_expert_grad_norms(norm_groups) # note that the get_global_norm function only supports l2 norm - return get_global_norm(norm_list=norm_groups) + return torch.norm(torch.stack(norm_groups)) def get_bit16_param_group(self, group_no): bit16_partitions = self.parallel_partitioned_bit16_groups[group_no] @@ -1899,12 +1925,9 @@ def update_lp_params(self): def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: - scaled_norm = norm * 1.0 / float(dist.get_world_size(group=self.real_dp_process_group[i])) - scaled_norm_tensor = torch.tensor(scaled_norm, - device=get_accelerator().device_name(), - dtype=torch.float) + scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=self.real_dp_process_group[i]) dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i]) - norm_groups[i] = scaled_norm_tensor.item() + norm_groups[i] = scaled_norm_tensor def unscale_and_clip_grads(self, grad_groups_flat, total_norm): # compute combined scale factor for this group @@ -1927,24 +1950,34 @@ def _check_overflow(self, partition_gradients=True): self.overflow = self.has_overflow(partition_gradients) # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): + def has_overflow_serial(self, params): + invalid_grad_count = torch.zeros([1], dtype=torch.float, device=get_accelerator().current_device_name()) for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False + if p.grad is not None: + float_grad = p.grad.float() + nan = float_grad.isnan() + inf = float_grad.isinf() + inf_or_nan = nan.logical_or(inf) + invalid_grad_count += inf_or_nan.float().max() + return invalid_grad_count def has_overflow_partitioned_grads_serial(self): + invalid_grad_count = torch.zeros([1], dtype=torch.float, device=get_accelerator().current_device_name()) for i in range(len(self.bit16_groups)): for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False + if grad is not None: + float_grad = grad.float() + nan = float_grad.isnan() + inf = float_grad.isinf() + inf_or_nan = nan.logical_or(inf) + invalid_grad_count += inf_or_nan.float().max() + return invalid_grad_count def has_overflow(self, partition_gradients=True): + device = torch.device(get_accelerator().current_device_name()) if partition_gradients: 
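# Illustrative sketch, not part of the patch, of the tensor-only overflow check introduced
# in this hunk, assuming only torch; the standalone helper name is hypothetical. Returning
# a device tensor instead of a Python bool avoids a host synchronization before the
# all_reduce(MAX) that follows in has_overflow.
import torch

def count_invalid_grads(grads, device):
    invalid = torch.zeros(1, dtype=torch.float, device=device)
    for g in grads:
        g32 = g.float()
        invalid += g32.isnan().logical_or(g32.isinf()).float().max()  # adds 1.0 if any inf/nan
    return invalid  # stays on device; an all_reduce with ReduceOp.MAX can consume it directly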
overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial() - overflow_gpu = get_accelerator().ByteTensor([overflow]) + overflow_gpu = get_accelerator().ByteTensor([overflow]) if self.cpu_offload else overflow.to(device) '''This will capture overflow across all data parallel and expert parallel process Since expert parallel process are a subset of data parallel process''' dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) @@ -1954,9 +1987,7 @@ def has_overflow(self, partition_gradients=True): for group in self.bit16_groups: for param in group: params.append(param) - - overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = get_accelerator().ByteTensor([overflow]) + overflow_gpu = self.has_overflow_serial(params) # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the model parallel GPUs @@ -1967,7 +1998,8 @@ def has_overflow(self, partition_gradients=True): # `x` is a torch.Tensor @staticmethod - def _has_inf_or_nan(x, j=None): + def _has_inf_or_nan(x, j=None, cpu_offload=False): + assert cpu_offload or x.device.type != 'hpu', "_has_inf_or_nan not optimized for hpu device" try: # if x is half, the .float() incurs an additional deep copy, but it's necessary if # Pytorch's .sum() creates a one-element tensor of the same type as x @@ -2397,7 +2429,9 @@ def estimate_zero2_model_states_mem_needs(total_params, gpu_mem = 2 * total_params cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor else: - gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) + # GPU's total_params multipliers: 2 = params_16bit, + # 14 = 2_grads_16bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance) + gpu_mem = 2 * total_params + int(14 * total_params / total_gpus) cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor return int(cpu_mem), int(gpu_mem) diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index 0bf1ca4a894d..d5bac643a730 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -50,6 +50,13 @@ class ZeRORuntimeException(Exception): except ImportError: pass +# Add hpex FusedAdamW to supported list if hpex is installed +try: + from habana_frameworks.torch.hpex.optimizers import FusedAdamW + ZERO_SUPPORTED_OPTIMIZERS.append(FusedAdamW) +except ImportError: + pass + def is_zero_supported_optimizer(optimizer): if dist.get_rank() == 0: @@ -62,10 +69,14 @@ def get_lst_from_rank0(lst: List[int]) -> None: NOTE: creates both communication and synchronization overhead so should be used sparingly """ + #if torch.distributed.get_backend() == "hccl": + # device=torch.device('hpu:0') + #else: + # device=torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) lst_tensor = torch.tensor( lst if dist.get_rank() == 0 else [-1] * len(lst), dtype=int, - # device=get_accelerator().current_device_name(), + # device=device, device=torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])), requires_grad=False, ) @@ -87,3 +98,74 @@ def assert_ints_same_as_other_ranks(ints: List[int]) -> None: if ints != rank0_ints: raise RuntimeError(f"disagreement between rank0 and rank{dist.get_rank()}: " f"rank0: {rank0_ints}, rank{dist.get_rank()}: {ints}") + + +def is_builtin_type(obj): + # https://stackoverflow.com/a/17795199 + return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" + + +def 
isinstance_namedtuple(obj: object) -> bool: + """ + Is this an instance of namedtuple/NamedTuple? + From: https://stackoverflow.com/a/62692640 + + Args: + obj (object): An object. + + Returns: + bool: True if namedtuple/NamedTuple else False. + """ + return isinstance(obj, tuple) and hasattr(obj, '_asdict') and hasattr(obj, '_fields') + + +def is_zero_param(parameter): + if not torch.is_tensor(parameter): + return False + return hasattr(parameter, 'ds_id') + + +def apply_to_tensors_only(function, value, warning_msg_fn=None): + """ + Apply `function` to every Tensor in `value`. + + Args: + functional: The function class to apply. + value (Any): Target object to apply `function` to. + + Returns: + Any: Output of `function`. + """ + if isinstance(value, (tuple, list)): + touched_outputs = [] + for elem in value: + touched_output = apply_to_tensors_only(function, elem) + touched_outputs.append(touched_output) + + if isinstance_namedtuple(value): + # namedtuples require a slightly different syntax. + return value.__class__(*touched_outputs) + + return value.__class__(touched_outputs) + elif isinstance(value, dict): + # apply inplace to avoid recreating dict inherited objects + for key in value.keys(): + value[key] = apply_to_tensors_only(function, value[key]) + return value + + elif isinstance(value, torch.Tensor): + # this also applies to torch.Tensor's subclasses like torch.nn.parameter.Parameter + touched_output = function(value) + + # restore zero param attributes if those get stripped by `backward_function` + if not is_zero_param(touched_output) and is_zero_param(value): + touched_output.ds_param_alias = value + + return touched_output + else: + if not is_builtin_type(value): + global warned + if warning_msg_fn and not warned and dist.get_rank() == 0: + logger.warning(warning_msg_fn(value)) + warned = True + return value diff --git a/deepspeed/tools/__init__.py b/deepspeed/tools/__init__.py new file mode 100644 index 000000000000..36d8fbe03558 --- /dev/null +++ b/deepspeed/tools/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team diff --git a/deepspeed/tools/pg_sim/__init__.py b/deepspeed/tools/pg_sim/__init__.py new file mode 100644 index 000000000000..917c5a255fa8 --- /dev/null +++ b/deepspeed/tools/pg_sim/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .pg import install_sim_dist_backend diff --git a/deepspeed/tools/pg_sim/pg.py b/deepspeed/tools/pg_sim/pg.py new file mode 100644 index 000000000000..b7e2483f86d0 --- /dev/null +++ b/deepspeed/tools/pg_sim/pg.py @@ -0,0 +1,306 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
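# Illustrative sketch, not part of the patch, of the container traversal performed by the
# apply_to_tensors_only helper added to deepspeed/runtime/zero/utils.py above, assuming
# only torch; this trimmed standalone version omits the ZeRO attribute handling and the
# one-time warning.
import torch

def apply_to_tensors(fn, value):
    if isinstance(value, (tuple, list)):
        out = [apply_to_tensors(fn, v) for v in value]
        if isinstance(value, tuple) and hasattr(value, '_fields'):  # namedtuple
            return value.__class__(*out)
        return value.__class__(out)
    if isinstance(value, dict):
        for key in value.keys():  # updated in place, as in the original helper
            value[key] = apply_to_tensors(fn, value[key])
        return value
    if isinstance(value, torch.Tensor):
        return fn(value)
    return value

nested = {'a': [torch.zeros(2), (torch.ones(2), 'meta')]}
halved = apply_to_tensors(lambda t: t * 0.5, nested)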
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import warnings +import torch +import torch.distributed as dist + +from datetime import timedelta +from functools import partial +from torch.distributed.constants import default_pg_timeout +from torch.distributed.distributed_c10d import (GroupMember, BroadcastOptions, AllreduceOptions, ReduceOp) +from torch.distributed.distributed_c10d import STORE_BASED_BARRIER_PREFIX # noqa +from deepspeed.accelerator import get_accelerator + + +class SimProcessGroup: + BACKEND = "sim" + DEFAULT_PG = None + WORLD_SIZE = -1 + STORE = None + + def __init__(self, rank, world_size, timeout, backend): + self.sim_rank = rank + self.pg_world_size = world_size + self.timeout = timeout + self.backend = backend + self.pg = None + self.torch_ver_major = int(torch.__version__.split('.')[0]) + self.torch_ver_minor = int(torch.__version__.split('.')[1]) + + assert self.torch_ver_major == 1, \ + f"Torch version major != 1 is not supported (version={torch.__version__})" + assert self.torch_ver_minor >= 10, \ + f"Torch version < 1.10 is not supported (version={torch.__version__})" + + if self.torch_ver_minor < 13: + warnings.warn(f"Torch version < 1.13 is not tested (version={torch.__version__})") + + # default is the first process group created + if SimProcessGroup.DEFAULT_PG is None: + SimProcessGroup.DEFAULT_PG = self + + @staticmethod + def get_dist_group_count(): + return torch.distributed.distributed_c10d._group_count + + @classmethod + def store_add_rest_of_world(cls, next_group): + group = cls.get_dist_group_count() + (1 if next_group else 0) + store_key = f"{STORE_BASED_BARRIER_PREFIX}:{group}" + cls.STORE.add(store_key, cls.WORLD_SIZE - 1) + + def _create_pg(self): + self.store_add_rest_of_world(next_group=False) + pg = dist.new_group(ranks=[0], timeout=self.timeout, backend=self.backend, pg_options=None) + return pg + + def post_create_sim_group(self): + self.pg = self._create_pg() + + @classmethod + def default_pg(cls): + assert cls.DEFAULT_PG is not None + return cls.DEFAULT_PG + + def size(self): + return self.pg_world_size + + def rank(self): + return self.sim_rank + + # ---------------------------------------------------- + # P2P + # + # P2P operations are simulated as all_reduce + # ---------------------------------------------------- + class P2PRequestObject: + """ Dummy p2p request object that is returned for p2p ops""" + + def __init__(self, src): + self.src = src + + def wait(self): + return + + def is_completed(self): + return True + + def _source_rank(self): + return self.src + + def _p2p_op(self, tensor_list, src=None): + opts = AllreduceOptions() + if self.torch_ver_minor > 10: + opts.reduceOp = ReduceOp.SUM + self.pg.allreduce(tensor_list, opts).wait() + src = src if src is not None else self.sim_rank + return SimProcessGroup.P2PRequestObject(src=src) + + def send(self, tensor_list, _group_dst_rank, _tag): + return self._p2p_op(tensor_list) + + def recv_anysource(self, tensor_list, _tag): + return self._p2p_op(tensor_list) + + def recv(self, tensor_list, src, _tag): + return self._p2p_op(tensor_list, src=src) + + # ---------------------------------------------------- + # Collectives + # + # For some collectives, it is required to shrink the + # input/output tensors_list to 1-element (world_size=1). + # also, need to make all other members of tensors_list to depend + # on the first element - to prevent incorrect graph signaling. 
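# Illustrative sketch, not part of the patch, of the shrink-and-copy scheme described in
# this comment block; the two standalone functions mirror _adjust_tensors_list_to_ws1 and
# _copy_data_from_tensor_to_tensor_list below, with hypothetical names and no class state.
def shrink_to_world_size_1(tensors_list, rank):
    # keep only this rank's slot in every per-rank tensor list (world_size becomes 1)
    return [tensors[rank:rank + 1] for tensors in tensors_list]

def copy_back_to_full_list(world1_tensors_list, tensors_list):
    # copy the locally computed slot into all remaining slots of each list
    for src, tensors in zip(world1_tensors_list, tensors_list):
        for t in tensors:
            t.data[:] = src[0].data[:]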
+ # The logic of shrink and then copy is handled by: + # - _adjust_tensors_list_to_ws1 + # - _copy_data_from_tensor_to_tensor_list + # ---------------------------------------------------- + @staticmethod + def _to_device(tensors, device): + if isinstance(tensors, dict): + return {k: SimProcessGroup._to_device(v, device) for k, v in tensors.items()} + elif isinstance(tensors, list): + return [SimProcessGroup._to_device(v, device) for v in tensors] + elif isinstance(tensors, torch.Tensor): + return tensors.to(device) + else: + assert False, 'Unsupported tensors type' + + def broadcast(self, tensors, opts): + """ ignore opts.rootRank and override to be the source """ + opts.rootRank = self.sim_rank + tensors = self._to_device(tensors, get_accelerator().current_device_name()) + return self.pg.broadcast(tensors, opts) + + def allreduce(self, tensors, opts): + return self.pg.allreduce(tensors, opts) + + def allreduce_coalesced(self, tensors, opts): + return self.pg.allreduce_coalesced(tensors, opts) + + def reduce(self, tensors, opts): + if opts.rootRank == self.sim_rank: + return self.pg.reduce(tensors, opts) + + broadcast_opts = BroadcastOptions() + broadcast_opts.rootRank = self.sim_rank + broadcast_opts.rootTensor = opts.rootTensor + return self.pg.broadcast(tensors, broadcast_opts) + + def _adjust_tensors_list_to_ws1(self, tensors_list): + """ receives list of lists of tensors and returns lists + of list-size-1 to match the world_size=1 + """ + world1_tensors_list = [] + for i, tensors in enumerate(tensors_list): + world1_tensors_list.append(tensors[self.sim_rank:self.sim_rank + 1]) + return world1_tensors_list + + @staticmethod + def _copy_data_from_tensor_to_tensor_list(source_tensors, tensors_list): + """ copy data from source tensors to all tensors in tensor list """ + for i, tensors in enumerate(tensors_list): + for t in tensors: + t.data[:] = source_tensors[i][0].data[:] + + def allgather(self, tensors_list, input_tensors, *kwargs): + world1_tensors_list = self._adjust_tensors_list_to_ws1(tensors_list) + handle = self.pg.allgather(world1_tensors_list, input_tensors, *kwargs) + self._copy_data_from_tensor_to_tensor_list(world1_tensors_list, tensors_list) + return handle + + def gather(self, output_tensors, input_tensors, opts): + if opts.rootRank == self.sim_rank: + world1_tensors_list = self._adjust_tensors_list_to_ws1(output_tensors) + handle = self.pg.gather(world1_tensors_list, input_tensors, opts) + self._copy_data_from_tensor_to_tensor_list(world1_tensors_list, output_tensors) + return handle + + broadcast_opts = BroadcastOptions() + broadcast_opts.rootRank = self.sim_rank + return self.pg.broadcast(input_tensors, broadcast_opts) + + def scatter(self, output_tensors, input_tensors, opts): + if opts.rootRank == self.sim_rank: + world1_tensors_list = self._adjust_tensors_list_to_ws1(input_tensors) + handle = self.pg.scatter(output_tensors, world1_tensors_list, opts) + self._copy_data_from_tensor_to_tensor_list(world1_tensors_list, input_tensors) + return handle + + broadcast_opts = BroadcastOptions() + broadcast_opts.rootRank = self.sim_rank + return self.pg.broadcast(output_tensors, broadcast_opts) + + def reduce_scatter(self, output_tensors, input_tensors, opts): + world1_tensors_list = self._adjust_tensors_list_to_ws1(input_tensors) + handle = self.pg.reduce_scatter(output_tensors, world1_tensors_list, opts) + self._copy_data_from_tensor_to_tensor_list(world1_tensors_list, input_tensors) + return handle + + def alltoall(self, output_tensors, input_tensors, _opts): + 
world1_in_tensors_list = input_tensors[self.sim_rank:self.sim_rank + 1] + world1_out_tensors_list = output_tensors[self.sim_rank:self.sim_rank + 1] + world1_out_tensors_list[0].data[:] = world1_in_tensors_list[0].data[:] + opts = AllreduceOptions() + if self.torch_ver_minor > 10: + opts.reduceOp = ReduceOp.SUM + handle = self.pg.allreduce(world1_out_tensors_list, opts) + return handle + + def barrier(self, opts): + opts.device_ids = [self.sim_rank] + return self.pg.barrier(opts) + + # ---------------------------------------------------- + # Create group registered function + # ---------------------------------------------------- + @classmethod + def create(cls, _store, rank, world_size, timeout, backend): + return cls(rank, world_size, timeout, backend) + + +def install_sim_dist_backend(sim_world_size, sim_rank): + + def wrapped_dist_init_process_group(backend, + init_method=None, + timeout=default_pg_timeout, + world_size=-1, + rank=-1, + store=None, + group_name="", + pg_options=None): + assert world_size == -1 or world_size == sim_world_size, \ + f'Inconsistent world_size: sim={sim_world_size} dist_init={world_size}' + + assert rank == -1 or rank == sim_rank, \ + f'Inconsistent rank: sim={sim_rank} dist_init={rank}' + + if backend == 'hccl': + import habana_frameworks.torch.distributed.hccl # noqa: F401 + + # override provided init_method/store with a dummy store + # For debug, it is better to use FileStore: + # import os + # my_store_filename = '/tmp/my_store' + # os.remove(my_store_filename) if os.path.exists(my_store_filename) else None + # os.remove(my_store_filename) + # store = torch.distributed.FileStore(my_store_filename, world_size) + store = torch.distributed.TCPStore(host_name="localhost", + port=12355, + world_size=sim_world_size, + is_master=True, + timeout=timedelta(seconds=300), + wait_for_workers=False) + + # set the simulated world size + SimProcessGroup.WORLD_SIZE = sim_world_size + SimProcessGroup.STORE = store + + # register sim backend + # create_fn = partial(SimProcessGroup.create, backend=default_backend) + create_fn = partial(SimProcessGroup.create, backend=backend) + dist.Backend.register_backend(SimProcessGroup.BACKEND, create_fn) + + # emulate all other world devices has joined the newly created group + SimProcessGroup.store_add_rest_of_world(next_group=True) + + orig_dist_init_process_group(backend=SimProcessGroup.BACKEND, + timeout=timeout, + world_size=sim_world_size, + rank=sim_rank, + store=store, + group_name=group_name, + pg_options=pg_options) + + SimProcessGroup.default_pg().post_create_sim_group() + + def wrapped_dist_new_group(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=None): + SimProcessGroup.store_add_rest_of_world(next_group=True) + pg = orig_dist_new_group(ranks=ranks, timeout=timeout, backend=backend, pg_options=pg_options) + + if pg != GroupMember.NON_GROUP_MEMBER: + if backend is None or backend == SimProcessGroup.BACKEND: + pg.post_create_sim_group() + + return pg + + def wrapped_dist_broadcast_object_list(object_list, src=0, group=None, device=None): + rank = SimProcessGroup.default_pg().sim_rank + if src != sim_rank: + raise RuntimeError(f'SimProcessGroup does not support dist.broadcast_object_list() ' + f'for src={src} different than sim_rank={rank}') + return orig_dist_broadcast_object_list(object_list, src, group, device) + + orig_dist_init_process_group = dist.init_process_group + dist.init_process_group = wrapped_dist_init_process_group + + orig_dist_new_group = dist.new_group + dist.new_group = 
wrapped_dist_new_group + + orig_dist_broadcast_object_list = dist.broadcast_object_list + dist.broadcast_object_list = wrapped_dist_broadcast_object_list diff --git a/deepspeed/tools/pg_sim/ut/base.py b/deepspeed/tools/pg_sim/ut/base.py new file mode 100644 index 000000000000..24889f944070 --- /dev/null +++ b/deepspeed/tools/pg_sim/ut/base.py @@ -0,0 +1,311 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import unittest +import functools +import torch +import torch.distributed as dist +import pytest + +from pg_sim.pg import (install_sim_dist_backend, GroupMember) + + +class TestBaseWrapper: + """ + BaseTestWrapper class ensures that the test cases encapsulated + in ProcessGroupSimTestBase will only be executed by subclasses. + """ + + class ProcessGroupSimTestBase(unittest.TestCase): + + def setUp(self) -> None: + self.world_size = 8 + self.rank = 0 + self.backend = self.get_backend() + self.device = self.get_device() + + self.assertIsNotNone(self.backend) + self.assertIsNotNone(self.device) + + install_sim_dist_backend(sim_world_size=self.world_size, sim_rank=self.rank) + + dist.init_process_group(backend=self.backend, + init_method=None, + store=None, + rank=self.rank, + world_size=self.world_size) + + def get_backend(self): + self.assertTrue(False, msg='get_backend must be implemented by derived test') + + def get_device(self): + self.assertTrue(False, msg='get_device must be implemented by derived test') + + def _get_row_first_rank(self): + row_ranks = list(set(range(self.world_size)) - {self.rank}) + return row_ranks[0] if row_ranks else None + + @staticmethod + def _get_torch_version(): + return int(torch.__version__.split('.')[1]) + + @pytest.mark.forked + def test_world(self): + res_rank = dist.get_rank() + res_ws = dist.get_world_size() + self.assertEqual(res_rank, self.rank) + self.assertEqual(res_ws, self.world_size) + + @pytest.mark.forked + def test_new_group(self): + t = torch.tensor([1, 2]).to(self.device) + t_in_out = t.clone() + + pg_1 = dist.new_group(ranks=[self.rank]) + dist.all_reduce(t_in_out, op=dist.ReduceOp.SUM, group=pg_1) + self.assertTrue(t.eq(t_in_out).all()) + + row_rank = self._get_row_first_rank() + if row_rank: + pg_2 = dist.new_group(ranks=[row_rank]) + self.assertEqual(pg_2, GroupMember.NON_GROUP_MEMBER) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + def _test_broadcast_impl(self, src): + t = torch.tensor([1, 2]).to(self.device) + handle = dist.broadcast(t, src=src, async_op=False) + self.assertIsNone(handle) + + t = torch.tensor([1, 2]).to(self.device) + handle = dist.broadcast(t, src=src, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + + @pytest.mark.forked + def test_broadcast_src(self): + self._test_broadcast_impl(src=self.rank) + + @pytest.mark.forked + def test_broadcast_dst(self): + row_rank = self._get_row_first_rank() + if row_rank: + self._test_broadcast_impl(src=row_rank) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + def _test_broadcast_object_type_impl(self, src): + if dist.get_rank() == src: + objects = ["foo", 12, {1: 2}] + else: + objects = [None, None, None] + + dev = torch.device(self.device) + dist.broadcast_object_list(objects, src=src, device=dev) + + @pytest.mark.forked + def test_broadcast_object_type_src(self): + self._test_broadcast_object_type_impl(src=self.rank) + + @pytest.mark.forked + def test_broadcast_object_type_dst(self): + row_rank = self._get_row_first_rank() 
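# Illustrative usage sketch, not part of the patch, for the simulated backend exercised by
# these tests, assuming deepspeed.tools.pg_sim is importable; the 'gloo' backend and CPU
# tensors are illustrative choices mirroring what setUp() above does.
import torch
import torch.distributed as dist
from deepspeed.tools.pg_sim import install_sim_dist_backend

def run_single_process_as_world_of_8():
    install_sim_dist_backend(sim_world_size=8, sim_rank=0)  # patches dist.init_process_group
    dist.init_process_group(backend='gloo', rank=0, world_size=8)
    t = torch.ones(2)
    dist.all_reduce(t)                  # executes locally; result equals the local tensor
    assert dist.get_world_size() == 8   # world size is reported as the simulated value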
+ if row_rank: + with pytest.raises(RuntimeError): + self._test_broadcast_object_type_impl(src=row_rank) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + @pytest.mark.forked + def test_all_reduce(self): + t = torch.tensor([1, 2]).to(self.device) + t_in_out = t.clone() + dist.all_reduce(t_in_out, op=dist.ReduceOp.SUM) + self.assertTrue(t.eq(t_in_out).all()) + + def _test_reduce_impl(self, dst): + t = torch.tensor([1.0, 2.0]).to(self.device) + t_in_out = t.clone() + + handle = dist.reduce(t_in_out, dst=dst, op=dist.ReduceOp.SUM, async_op=False) + self.assertIsNone(handle) + self.assertTrue(t.eq(t_in_out).all()) + + handle = dist.reduce(t_in_out, dst=dst, op=dist.ReduceOp.SUM, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + self.assertTrue(t.eq(t_in_out).all()) + + @pytest.mark.forked + def test_reduce_src(self): + self._test_reduce_impl(dst=self.rank) + + @pytest.mark.forked + def test_reduce_dst(self): + row_rank = self._get_row_first_rank() + if row_rank: + self._test_reduce_impl(dst=row_rank) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + @pytest.mark.forked + def test_all_gather(self): + tensor_list = [torch.zeros(2).to(self.device) for _ in range(self.world_size)] + tensor = torch.ones(2).to(self.device) + + handle = dist.all_gather(tensor_list, tensor, async_op=False) + self.assertIsNone(handle) + self.assertTrue(tensor_list[0].eq(tensor).all()) + + handle = dist.all_gather(tensor_list, tensor, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + self.assertTrue(tensor_list[0].eq(tensor).all()) + + def _test_gather_impl(self, dst, local_dst): + torch_version = self._get_torch_version() + if (self.backend == 'nccl') and (torch_version <= 10): + self.skipTest(f'Skipping {self._testMethodName} for nccl ' + f'for torch.version={torch_version}') + + tensor = torch.ones(2).to(self.device) + gather_list = [torch.zeros(2).to(self.device) for _ in range(self.world_size)] if local_dst else None + + handle = dist.gather(tensor, gather_list, dst=dst, async_op=False) + self.assertIsNone(handle) + if local_dst: + self.assertTrue(gather_list[dst].eq(tensor).all()) + + handle = dist.gather(tensor, gather_list, dst=dst, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + if local_dst: + self.assertTrue(gather_list[dst].eq(tensor).all()) + + @pytest.mark.forked + def test_gather_src(self): + self._test_gather_impl(dst=self.rank, local_dst=True) + + @pytest.mark.forked + def test_gather_not_src(self): + row_rank = self._get_row_first_rank() + if row_rank: + self._test_gather_impl(dst=row_rank, local_dst=False) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + def _test_scatter_impl(self, src, local_src): + if self.backend not in ('gloo', 'mpi'): + self.skipTest(f'Skipping {self._testMethodName} for {self.backend}') + + tensor = torch.ones(2).to(self.device) + scatter_list = [torch.zeros(2).to(self.device) for _ in range(self.world_size)] if local_src else None + + handle = dist.scatter(tensor, scatter_list, src=src, async_op=False) + self.assertIsNone(handle) + if local_src: + self.assertTrue(scatter_list[src].eq(tensor).all()) + + handle = dist.scatter(tensor, scatter_list, src=src, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + if local_src: + self.assertTrue(scatter_list[src].eq(tensor).all()) + + @pytest.mark.forked + def test_scatter_src(self): + self._test_scatter_impl(src=self.rank, local_src=True) + + @pytest.mark.forked + def test_scatter_not_src(self): + row_rank = self._get_row_first_rank() + 
if row_rank: + self._test_scatter_impl(src=row_rank, local_src=False) + else: + self.skipTest(f'Skipping {self._testMethodName}') + + @pytest.mark.forked + def test_reduce_scatter(self): + if self.backend not in ('nccl', 'hccl'): + self.skipTest(f'Skipping {self._testMethodName} for {self.backend}') + + output = torch.ones(2).to(self.device) + input_list = [torch.zeros(2).to(self.device) for _ in range(self.world_size)] + + handle = dist.reduce_scatter(output, input_list, async_op=False) + self.assertIsNone(handle) + self.assertTrue(input_list[self.rank].eq(output).all()) + + handle = dist.reduce_scatter(output, input_list, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + self.assertTrue(input_list[self.rank].eq(output).all()) + + @pytest.mark.forked + def test_all_to_all(self): + if self.backend not in ('nccl', 'hccl', 'mpi'): + self.skipTest(f'Skipping {self._testMethodName} for {self.backend}') + + output_list = [torch.zeros(1).to(self.device) for _ in range(self.world_size)] + input_list = list( + torch.arange(self.world_size, dtype=torch.float32).add(1.).to(self.device).chunk(self.world_size)) + + expected_res = [ + torch.zeros(1).to(self.device) if i != self.rank else torch.ones(1).to(self.device) + for i in range(self.world_size) + ] + + handle = dist.all_to_all(output_list, input_list, async_op=False) + self.assertIsNone(handle) + self.assertTrue( + functools.reduce(lambda x, y: x and y, map(lambda p, q: p == q, expected_res, output_list), True)) + + handle = dist.all_to_all(output_list, input_list, async_op=True) + self.assertIsNotNone(handle) + handle.wait() + self.assertTrue( + functools.reduce(lambda x, y: x and y, map(lambda p, q: p == q, expected_res, output_list), True)) + + @pytest.mark.forked + def test_barrier(self): + handle = dist.barrier(async_op=False) + self.assertIsNone(handle) + + handle = dist.barrier(async_op=True) + self.assertIsNotNone(handle) + handle.wait() + + @pytest.mark.forked + def test_p2p_send(self): + tensor = torch.ones(2).to(self.device) + dist.send(tensor, dst=self.rank, group=None, tag=0) + + row_rank = self._get_row_first_rank() + dist.send(tensor, dst=row_rank, group=None, tag=0) if row_rank else None + + handle = dist.isend(tensor, dst=self.rank, group=None, tag=0) + self.assertIsNotNone(handle) + handle.wait() + + handle = dist.isend(tensor, dst=row_rank, group=None, tag=0) + self.assertIsNotNone(handle) + handle.wait() + + @pytest.mark.forked + def test_p2p_recv(self): + tensor = torch.zeros(2).to(self.device) + dist.recv(tensor, src=self.rank, group=None, tag=0) + + row_rank = self._get_row_first_rank() + dist.recv(tensor, src=row_rank, group=None, tag=0) if row_rank else None + + handle = dist.irecv(tensor, src=self.rank, group=None, tag=0) + self.assertIsNotNone(handle) + handle.wait() + + handle = dist.irecv(tensor, src=row_rank, group=None, tag=0) + self.assertIsNotNone(handle) + handle.wait() diff --git a/deepspeed/tools/pg_sim/ut/gloo.py b/deepspeed/tools/pg_sim/ut/gloo.py new file mode 100644 index 000000000000..347f9b96a741 --- /dev/null +++ b/deepspeed/tools/pg_sim/ut/gloo.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
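# Illustrative sketch, not part of the patch, of the test-wrapper pattern used by base.py
# and the per-backend files that follow (gloo/hccl/nccl); class names here are
# hypothetical. Nesting the shared TestCase inside a plain holder class keeps unittest and
# pytest from collecting it directly, so only concrete subclasses that define a backend run
# the shared tests.
import unittest

class _SharedWrapper:

    class SharedTests(unittest.TestCase):

        def get_backend(self):
            raise NotImplementedError

        def test_backend_defined(self):
            self.assertIsNotNone(self.get_backend())

class GlooFlavor(_SharedWrapper.SharedTests):

    def get_backend(self):
        return 'gloo'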
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import unittest +from pg_sim.ut.base import TestBaseWrapper + + +class ProcessGroupSimTestGloo(TestBaseWrapper.ProcessGroupSimTestBase): + + def setUp(self) -> None: + super(ProcessGroupSimTestGloo, self).setUp() + + def get_backend(self): + return 'gloo' + + def get_device(self): + return 'cpu' + + +if __name__ == '__main__': + unittest.main() diff --git a/deepspeed/tools/pg_sim/ut/hccl.py b/deepspeed/tools/pg_sim/ut/hccl.py new file mode 100644 index 000000000000..37fabcfb9363 --- /dev/null +++ b/deepspeed/tools/pg_sim/ut/hccl.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import unittest +from pg_sim.ut.base import TestBaseWrapper + + +class ProcessGroupSimTestHccl(TestBaseWrapper.ProcessGroupSimTestBase): + + def setUp(self) -> None: + super(ProcessGroupSimTestHccl, self).setUp() + + def get_backend(self): + return 'hccl' + + def get_device(self): + return 'hpu' + + +if __name__ == '__main__': + unittest.main() diff --git a/deepspeed/tools/pg_sim/ut/nccl.py b/deepspeed/tools/pg_sim/ut/nccl.py new file mode 100644 index 000000000000..da6b8f23ed47 --- /dev/null +++ b/deepspeed/tools/pg_sim/ut/nccl.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import unittest +from pg_sim.ut.base import TestBaseWrapper + + +class ProcessGroupSimTestNccl(TestBaseWrapper.ProcessGroupSimTestBase): + + def setUp(self) -> None: + super(ProcessGroupSimTestNccl, self).setUp() + + def get_backend(self): + return 'nccl' + + def get_device(self): + return 'cuda' + + +if __name__ == '__main__': + unittest.main() diff --git a/deepspeed/tools/tensor_logger/__init__.py b/deepspeed/tools/tensor_logger/__init__.py new file mode 100644 index 000000000000..55ea7227e48d --- /dev/null +++ b/deepspeed/tools/tensor_logger/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .tensor_logger import TensorLogger, save_logged_tensors diff --git a/deepspeed/tools/tensor_logger/tensor_logger.py b/deepspeed/tools/tensor_logger/tensor_logger.py new file mode 100644 index 000000000000..f128b9a6b0a0 --- /dev/null +++ b/deepspeed/tools/tensor_logger/tensor_logger.py @@ -0,0 +1,234 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import collections +from functools import partial +from contextlib import contextmanager +from torch.functional import Tensor +from os import makedirs +from os.path import join + + +class TensorLogger: + """ Records nn.Module's activations and gradients tensors + + Description: + Records up to end_iteration(if 0, recording is disabled) from start_iteration. + If log_activations_enabled, nn.Module's activations are recorded during forward. + If log_grads_enabled, nn.Module's gradients are recorded during back propagation. + If log_inputs_enabled, model inputs are recorded. 
+ + Usage: + Integrated within the training loop: + tensor_logger = TensorLogger(model, start_iteration=2, end_iteration=2) + # dumps second iteration only, iteration number starts with 1 + + for i, samples in enumerate(data_loader) # training loop + with tensor_logger.log_iteration(i): + # run forward/backward iteration + + tensor_logger.save(filename) + + Another alternative: + tensor_logger = TensorLogger(model, end_iteration=2) + + for i, samples in enumerate(data_loader) # training loop + with tensor_logger: + tensor_logger.set_iteration(i) + # run forward/backward iteration + + tensor_logger.save(filename) + + Implementation notes: + forward/backward activations/gradients are collected using nn.Module hooks. + However, model inputs are collected by overloading model.forward() method. + Model inputs can't be collected using the hooks since the hooks only provide + inputs and do not provide kwargs, if exist, of the forward method. + """ + + def __init__(self, + model, + start_iteration=0, + end_iteration=0, + log_activations_enabled=False, + log_grads_enabled=False, + log_inputs_enabled=False, + prefix=None): + + # for now, no support for virtual pipeline (interleaved) + if isinstance(model, list): + assert len(model) == 1, 'No support for list of multiple models (len={})'.format(len(model)) + model = model[0] + + self.model = model + self.start_iteration = start_iteration + self.end_iteration = end_iteration + self.log_activations_enabled = log_activations_enabled + self.log_grads_enabled = log_grads_enabled + self.log_inputs_enabled = log_inputs_enabled + self.prefix = 'model' if prefix is None else prefix + + # captured tensors are saved in the following hierarchy: + # { + # iteration: { # iteration number + # tensor_type: { # fwd_act/bwd_grad_in/bwd_grad_out + # name: [tensors] # tensor name's tensors. list is required due to e.g. 
grad accumulation
+        #     }
+        #   }
+        # }
+        class IterData(dict):
+
+            def __init__(self):
+                super(IterData, self).__init__()
+                self['fwd_act'] = collections.defaultdict(list)
+                self['bwd_grad_in'] = collections.defaultdict(list)
+                self['bwd_grad_out'] = collections.defaultdict(list)
+                self['model_inputs'] = collections.defaultdict(list)
+
+        self.data = collections.defaultdict(IterData)
+        self.active = False
+        self.current_iteration = 0
+        self.fwd_handles = []
+        self.bwd_handles = []
+
+    def _fqn(self, name):
+        return '.'.join([self.prefix, name]) if name else self.prefix
+
+    def set_iteration(self, iteration):
+        self.current_iteration = iteration
+
+    def get_num_recorded_iterations(self):
+        return len(self.data)
+
+    @contextmanager
+    def log_iteration(self, iteration):
+        self.current_iteration = iteration
+        self._enable()
+        yield self
+        self._disable()
+
+    def __enter__(self):
+        self._enable()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self._disable()
+
+    def clear(self):
+        self.data.clear()
+
+    def save(self, filename, do_clear=True):
+
+        def convert_for_pickle(obj):
+            if isinstance(obj, dict):
+                return {k: convert_for_pickle(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [convert_for_pickle(e) for e in obj]
+            elif isinstance(obj, tuple):
+                return tuple([convert_for_pickle(e) for e in obj])
+            else:
+                if isinstance(obj, Tensor):
+                    return obj.detach().cpu()
+                else:
+                    return obj
+
+        data = convert_for_pickle(self.data)
+        torch.save(data, filename)
+        self.clear() if do_clear else None
+
+    def _enable(self):
+        if not self.active and self.start_iteration <= self.current_iteration <= self.end_iteration:
+            self.active = True
+            self._enable_log_grads() if self.log_grads_enabled else None
+            self._enable_log_activations() if self.log_activations_enabled else None
+            self._enable_log_inputs() if self.log_inputs_enabled else None
+
+    def _disable(self):
+        if self.active:
+            self.active = False
+            self._disable_log_grads()
+            self._disable_log_activations()
+            self._disable_log_inputs()
+
+    @staticmethod
+    def _extract_tensors(t):
+        if t is None:
+            return None
+        elif isinstance(t, int):
+            return torch.tensor(t)
+        elif isinstance(t, torch.Tensor):
+            return t.detach().contiguous()
+        elif isinstance(t, list):
+            return [TensorLogger._extract_tensors(e) for e in t]
+        elif isinstance(t, tuple):
+            return tuple(TensorLogger._extract_tensors(e) for e in t)
+        elif isinstance(t, dict):
+            return {k: TensorLogger._extract_tensors(v) for k, v in t.items()}
+        assert False, 'Unsupported type: {}'.format(type(t))
+
+    def _save_fwd_activation(self, name, _mod, _inp, out):
+        fwd_act = self._extract_tensors(out)
+        self.data[self.current_iteration]['fwd_act'][name].append(fwd_act)
+
+    def _save_bwd_grads(self, name, _mod, grad_input, grad_output):
+        grad_in = self._extract_tensors(grad_input)
+        grad_out = self._extract_tensors(grad_output)
+        self.data[self.current_iteration]['bwd_grad_in'][name].append(grad_in)
+        self.data[self.current_iteration]['bwd_grad_out'][name].append(grad_out)
+
+    def _save_inputs(self, *inp, **kwargs):
+        model_inputs = self._extract_tensors(inp)
+        model_kwargs = self._extract_tensors(kwargs)
+        self.data[self.current_iteration]['model_inputs']['inputs'].append(model_inputs)
+        self.data[self.current_iteration]['model_inputs']['kwargs'].append(model_kwargs)
+
+    def _enable_log_grads(self):
+        #Revert after [SW-69765] is fixed
+        full_bwd_hook_supported = False
+        for name, m in self.model.named_modules():
+            register_fn = m.register_full_backward_hook if full_bwd_hook_supported else
m.register_backward_hook + h = register_fn(partial(self._save_bwd_grads, self._fqn(name))) + self.bwd_handles.append(h) + + def _enable_log_activations(self): + for name, m in self.model.named_modules(): + h = m.register_forward_hook(partial(self._save_fwd_activation, self._fqn(name))) + self.fwd_handles.append(h) + + def _enable_log_inputs(self): + + def wrapped_forward(*inputs, **kwargs): + self._save_inputs(*inputs, **kwargs) + return self.model.original_forward__(*inputs, **kwargs) + + self.model.original_forward__ = self.model.forward + self.model.forward = wrapped_forward + + def _disable_log_grads(self): + for h in self.bwd_handles: + h.remove() + self.bwd_handles = [] + + def _disable_log_activations(self): + for h in self.fwd_handles: + h.remove() + self.fwd_handles = [] + + def _disable_log_inputs(self): + if hasattr(self.model, 'original_forward__'): + self.model.forward = self.model.original_forward__ + del self.model.original_forward__ + + +def save_logged_tensors(tensor_logger: TensorLogger, tensor_logger_path, rank_no, iteration=None): + if tensor_logger.get_num_recorded_iterations(): + makedirs(tensor_logger_path, exist_ok=True) + filename = 'tensor_logger_rank_{}'.format(rank_no) + '.pt' + if iteration is not None: + filename = 'tensor_logger_rank_{}_iter_{}'.format(rank_no, iteration) + '.pt' + fullname = join(tensor_logger_path, filename) + tensor_logger.save(fullname) diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 4aac4ded1243..23139e84385b 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -50,7 +50,7 @@ def __init__(self, name): self.name_ = name self.started_ = False self.event_timers = [] - self.use_host_timer = get_accelerator().is_synchronized_device() + self.use_host_timer = get_accelerator().use_host_timers() self.start_event = None self.elapsed_records = None self.start_time = 0.0 @@ -237,7 +237,8 @@ def start(self): self._init_timer() self.started = True if self.global_step_count >= self.start_step: - get_accelerator().synchronize() + if not get_accelerator().device_name() == 'hpu': + get_accelerator().synchronize() self.start_time = time.time() def stop(self, global_step=False, report_speed=True): @@ -249,13 +250,15 @@ def stop(self, global_step=False, report_speed=True): self.global_step_count += 1 if self.start_time > 0: - get_accelerator().synchronize() + if not get_accelerator().device_name() == 'hpu': + get_accelerator().synchronize() self.end_time = time.time() duration = self.end_time - self.start_time self.total_elapsed_time += duration self.step_elapsed_time += duration if global_step: + #curr_samples_sec = (self.batch_size * self.num_workers) / duration if report_speed and self.global_step_count % self.steps_per_output == 0: self.logging( "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index c98caae31534..49b846633d6e 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -248,6 +248,11 @@ def _zero2_merge_frozen_params(state_dict, zero_model_states): print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): param_shapes = zero_model_states[0].param_shapes @@ -287,7 +292,7 @@ def _zero2_merge_trainable_params(state_dict, world_size, 
fp32_flat_groups, zero avail_numel = full_single_fp32_vector.numel() for name, shape in shapes.items(): - unpartitioned_numel = shape.numel() + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) total_numel += unpartitioned_numel total_params += 1 diff --git a/op_builder/hpu/__init__.py b/op_builder/hpu/__init__.py new file mode 100644 index 000000000000..5ad1b9a7f891 --- /dev/null +++ b/op_builder/hpu/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder +from .transformer_inference import InferenceBuilder +from .no_impl import NotImplementedBuilder diff --git a/op_builder/hpu/builder.py b/op_builder/hpu/builder.py new file mode 100644 index 000000000000..5a538c84040c --- /dev/null +++ b/op_builder/hpu/builder.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class CPUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/hpu/cpu_adam.py b/op_builder/hpu/cpu_adam.py new file mode 100644 index 000000000000..2f3b7aefe705 --- /dev/null +++ b/op_builder/hpu/cpu_adam.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class CPUAdamBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def cxx_args(self): + args = super().cxx_args() + args += ['-DENABLE_BFLOAT16'] + return args + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/hpu/fused_adam.py b/op_builder/hpu/fused_adam.py new file mode 100644 index 000000000000..d77228317ddb --- /dev/null +++ b/op_builder/hpu/fused_adam.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
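# Illustrative sketch, not part of the patch, of the shape handling added to
# zero_to_fp32.py above, assuming torch and Python >= 3.8; a recorded parameter shape may
# arrive as a torch.Size (which has .numel()) or as a plain tuple, so the element count
# falls back to a product over the dimensions.
import math
import torch

def numel_of(shape):
    numel_fn = getattr(shape, 'numel', None)
    return numel_fn() if callable(numel_fn) else math.prod(shape)

assert numel_of(torch.Size([2, 3])) == 6
assert numel_of((2, 3)) == 6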
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class FusedAdamBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/cpu/adam/fused_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def cxx_args(self): + args = super().cxx_args() + args += ['-DENABLE_BFLOAT16'] + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/hpu/no_impl.py b/op_builder/hpu/no_impl.py new file mode 100644 index 000000000000..140d65b48def --- /dev/null +++ b/op_builder/hpu/no_impl.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class NotImplementedBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on HPU backend.") + + def sources(self): + return [] diff --git a/op_builder/hpu/transformer_inference.py b/op_builder/hpu/transformer_inference.py new file mode 100644 index 000000000000..4312cf58956b --- /dev/null +++ b/op_builder/hpu/transformer_inference.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..torch_fallback_builder import TorchInferenceOpBuilder + + +class InferenceBuilder(TorchInferenceOpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=self.NAME) diff --git a/op_builder/torch_fallback_builder.py b/op_builder/torch_fallback_builder.py new file mode 100644 index 000000000000..c6e0df880304 --- /dev/null +++ b/op_builder/torch_fallback_builder.py @@ -0,0 +1,117 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +# taken from op_builder/cpu/builder.py +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 + from op_builder.builder import OpBuilder + import op_builder.torch_fallback_kernels as torch_fallback_kernels +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + import deepspeed.ops.op_builder.torch_fallback_kernels as torch_fallback_kernels + + +class TorchInferenceOp: + """Torch implementations for inference operations""" + + def gated_activation(activation, bias, activation_func_type): + return torch_fallback_kernels.gated_activation_fallback(activation, bias, activation_func_type) + + def layer_norm(vals, gamma, beta, epsilon): + return torch_fallback_kernels.layer_norm_fallback(vals, gamma, beta, epsilon) + + def _layer_norm_residual(vals, bias, res, gamma, beta, epsilon): + return torch_fallback_kernels.layer_norm_residual_fallback(vals, bias, res, gamma, beta, epsilon) + + def layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon): + return torch_fallback_kernels.layer_norm_residual_store_pre_ln_res_fallback( + vals, bias, res, gamma, beta, epsilon) + + def moe_res_matmul(residual, coef, output): + return torch_fallback_kernels.moe_res_matmul_fallback(residual, coef, output) + + def reset_cache(): + return torch_fallback_kernels.InferenceContext.Instance().reset_tokens() + + def allocate_workspace_fp32(hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, external_cache, + rank, max_out_tokens, min_out_tokens): + return torch_fallback_kernels.InferenceContext.Instance().gen_workspace(num_layers, num_heads, batch_size, + prompt_length, hidden_dim, mp_size, + external_cache, torch.float, rank, + max_out_tokens, min_out_tokens) + + def allocate_workspace_bf16(hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, external_cache, + rank, max_out_tokens, min_out_tokens): + return torch_fallback_kernels.InferenceContext.Instance().gen_workspace(num_layers, num_heads, batch_size, + prompt_length, hidden_dim, mp_size, + external_cache, torch.bfloat16, rank, + max_out_tokens, min_out_tokens) + + def allocate_workspace_fp16(hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, external_cache, + rank, max_out_tokens, min_out_tokens): + return torch_fallback_kernels.InferenceContext.Instance().gen_workspace(num_layers, num_heads, batch_size, + prompt_length, hidden_dim, mp_size, + external_cache, torch.half, rank, + max_out_tokens, min_out_tokens) + + def release_workspace(): + return torch_fallback_kernels.release_workspace() + + def retake_workspace(): + return torch_fallback_kernels.retake_workspace() + + def pre_rms_norm(vals, residual, gamma, epsilon): + return torch_fallback_kernels.pre_rms_norm_fallback(vals, residual, gamma, epsilon) + + def rms_norm(vals, gamma, epsilon): + return torch_fallback_kernels.rms_norm_fallback(vals, gamma, epsilon) + + def _vector_add(a, b, gamma): + return torch_fallback_kernels.vector_add_fallback(a, b, gamma) + + +def define_func_dtypes(clas, func, name=None): + for dtype in torch_fallback_kernels.dtype_names_dict.values(): + setattr(clas, f"{name or func.__name__}_{dtype}", func) + + +def define_dtype_funcs_to_class(clas): + define_func_dtypes(clas, torch_fallback_kernels.bias_add) + define_func_dtypes(clas, torch_fallback_kernels.bias_gelu_fallback, "bias_gelu") + define_func_dtypes(clas, torch_fallback_kernels.bias_relu_fallback, "bias_relu") + define_func_dtypes(clas, 
torch_fallback_kernels.gelu_gemm_fallback, "fused_gemm_gelu") + define_func_dtypes(clas, torch_fallback_kernels.mlp_gemm_fallback, "mlp_gemm") + define_func_dtypes(clas, torch_fallback_kernels.rms_mlp_gemm_fallback, "rms_mlp_gemm") + define_func_dtypes(clas, torch_fallback_kernels.residual_add_bias_fallback, "residual_add_bias") + define_func_dtypes(clas, torch_fallback_kernels.qkv_gemm_fallback, "qkv_gemm") + define_func_dtypes(clas, torch_fallback_kernels.rms_qkv_gemm_fallback, "rms_qkv_gemm") + define_func_dtypes(clas, torch_fallback_kernels.transform4d_0213, "transform4d_0213") + define_func_dtypes(clas, torch_fallback_kernels.bias_add_transform_0213, "bias_add_transform_0213") + define_func_dtypes(clas, torch_fallback_kernels.vector_matmul_fallback, "vector_matmul") + define_func_dtypes(clas, torch_fallback_kernels.softmax_fallback, "softmax") + define_func_dtypes(clas, torch_fallback_kernels.softmax_context_fallback, "softmax_context") + + +class TorchInferenceOpBuilder(OpBuilder): + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f"deepspeed.ops.transformer.inference.{self.NAME}_op" + + def sources(self): + return [] + + def load(self, verbose=True): + clas = TorchInferenceOp + define_dtype_funcs_to_class(clas) + return clas diff --git a/op_builder/torch_fallback_kernels.py b/op_builder/torch_fallback_kernels.py new file mode 100644 index 000000000000..562d587b0a5b --- /dev/null +++ b/op_builder/torch_fallback_kernels.py @@ -0,0 +1,539 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Copyright Habana Labs, Ltd. 
an Intel Company""" + +import torch +import torch.nn.functional as F + +dtype_names_dict = { + torch.float16: "fp16", + torch.bfloat16: "bf16", + torch.float32: "fp32", +} + +minus_inf = -10000 + + +class InferenceContext(): + + __instance = None + + def __init__(self): + self.kv_cache = None + self.kv_cache_elem_size = None + self.num_tokens = 1 + self.kv_cache_num_layers = None + self.kv_cache_size = None + self.max_out_tokens = None + self.rotary = None + self.allocate_called = False + + @classmethod + def Instance(cls): + if InferenceContext.__instance is None: + InferenceContext.__instance = InferenceContext() + return InferenceContext.__instance + + def gen_workspace(self, num_layers, num_heads, batch_size, prompt_len, hidden_dim, mp_size, external_cache, + elem_size, rank, max_out_tokens, min_out_tokens): + assert mp_size == 1, f"{mp_size=} not supported" + self.allocate_called = True + if not external_cache: + self.kv_cache_num_layers = num_layers + self.max_out_tokens = max_out_tokens + head_size = hidden_dim // num_heads + self.kv_cache_size = torch.Size([batch_size, (num_heads // mp_size), max_out_tokens, head_size]) + self.kv_cache_elem_size = elem_size + self.retake_workspace() + self.num_tokens = 0 + self.static_shapes = True + return True + + def retake_workspace(self): + if not self.allocate_called: + return True + + import deepspeed.accelerator as accelerator + if self.kv_cache is None: + # (key,value) tuple for each layer + self.kv_cache = [(torch.full(self.kv_cache_size, + minus_inf, + dtype=self.kv_cache_elem_size, + device=accelerator.get_accelerator().device_name()), + torch.full(self.kv_cache_size, + minus_inf, + dtype=self.kv_cache_elem_size, + device=accelerator.get_accelerator().device_name())) + ] * self.kv_cache_num_layers + return True + + def update_cache(self, layer_id, token_idx, is_prompt, bat_0213_key, bat_0213_value): + # Update current token + if is_prompt: + self.static_shapes = True + if token_idx is None: + self.static_shapes = False + InferenceContext.Instance().reset_tokens(bat_0213_key.shape[2]) + else: + InferenceContext.Instance().reset_tokens(token_idx) + + if token_idx is None: + token_idx = InferenceContext.Instance().current_tokens() + + # Update cache content + if is_prompt: + cache_max_seq = self.kv_cache[layer_id][0].shape[2] + cache_max_head_dim = self.kv_cache[layer_id][0].shape[3] + mask = torch.arange(cache_max_seq, device=bat_0213_key.device) + mask = mask.ge(token_idx) + mask = mask.unsqueeze(-1) + mask = mask.expand([cache_max_seq, cache_max_head_dim]) + + self.kv_cache[layer_id][0][:, :, :bat_0213_key.shape[2], :].copy_(bat_0213_key) + self.kv_cache[layer_id][0][:, :].masked_fill(mask, minus_inf) + self.kv_cache[layer_id][1][:, :, :bat_0213_value.shape[2], :].copy_(bat_0213_value) + self.kv_cache[layer_id][1][:, :].masked_fill(mask, 0) + else: + if self.static_shapes: + assert type(token_idx) == torch.Tensor, "token_idx is expected to be torch.Tensor" + self.kv_cache[layer_id][0].index_copy_(2, token_idx - 1, bat_0213_key) + self.kv_cache[layer_id][1].index_copy_(2, token_idx - 1, bat_0213_value) + else: + assert type(token_idx) == int, "token_idx is expected to be int" + self.kv_cache[layer_id][0][:, :, token_idx - 1, :] = bat_0213_key + self.kv_cache[layer_id][1][:, :, token_idx - 1, :] = bat_0213_value + + bat_0213_key = self.kv_cache[layer_id][0] + bat_0213_value = self.kv_cache[layer_id][1] + + if not self.static_shapes: + bat_0213_key = bat_0213_key[:, :, :token_idx, :] + bat_0213_value = bat_0213_value[:, :, :token_idx, :] + 
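# Illustrative sketch, not part of the patch, of the static-shape decode update performed
# in update_cache above, assuming only torch; the sizes are arbitrary. The preallocated
# cache of shape [batch, heads, max_out_tokens, head_dim] is written in place at position
# token_idx - 1 with index_copy_, so nothing is reallocated per generated token.
import torch

batch, heads, max_tokens, head_dim = 1, 2, 8, 4
cache_key = torch.zeros(batch, heads, max_tokens, head_dim)
new_key = torch.ones(batch, heads, 1, head_dim)    # key of the token being generated
token_idx = torch.tensor([3])                      # current token position (1-based)
cache_key.index_copy_(2, token_idx - 1, new_key)   # writes slot 2 along the token dimension
assert cache_key[:, :, 2].eq(1).all() and cache_key[:, :, 3].eq(0).all()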
+ return bat_0213_key, bat_0213_value + + def release_workspace(self): + self.kv_cache = None + self.rotary = None + + def reset_tokens(self, initial_tokens=1): + self.num_tokens = initial_tokens + + def current_tokens(self): + return self.num_tokens + + def advance_tokens(self): + self.num_tokens = self.num_tokens + 1 + + def get_kv_cache(self): + return self.kv_cache + + def get_rotary(self, rotary_dim, rope_theta): + if self.rotary is None: + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + + self.rotary = LlamaRotaryEmbedding(rotary_dim, base=rope_theta) + + return self.rotary + + def get_max_tokens_num(self): + return self.max_out_tokens + + +def bias_add(input, bias): + return torch.add(input, bias) + + +def bias_gelu_fallback(activations, bias): + # Expected behavior is that of casting to float32 internally and using the tanh approximation + return F.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) + + +def bias_relu_fallback(activations, bias): + # Expected behavior is that of casting to float32 internally + return F.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + + +def gated_geglu_fallback(activations, bias): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activations = activations + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + return hidden_states * F.gelu(gate.to(torch.float32)).to(activations.dtype) + + +def gated_silu_fallback(activations, bias): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activations = activations + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + return hidden_states * F.silu(gate.to(torch.float32)).to(activations.dtype) + + +def gated_activation_fallback(activations, bias, activation_func_type): + from deepspeed.utils.types import ActivationFuncType + if activation_func_type == ActivationFuncType.GATED_SILU: + return gated_silu_fallback(activations, bias) + elif activation_func_type == ActivationFuncType.GATED_GELU: + return gated_geglu_fallback(activations, bias) + # Default, shouldn't happen + raise NotImplementedError + + +def gelu_gemm_fallback(input, weight, scale, bias, out, out_scale, dtype, transpose): + tmp = torch.matmul(input, weight) + tmp = F.gelu(tmp.to(torch.float32) + bias.to(torch.float32), approximate="tanh").to(tmp.dtype) + output = torch.matmul(tmp, out) + return output + + +def layer_norm_fallback(vals, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + +def layer_norm_residual_fallback(vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + +def layer_norm_residual_store_pre_ln_res_fallback(vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = 
res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + res_output = vals_f + bias_f + res_f + norm_output = F.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + return norm_output, res_output.to(dtype) + + +def moe_res_matmul_fallback(residual, coef_t, output): + coef = coef_t.transpose(1, 2).contiguous() + coef1, coef2 = torch.split(coef, split_size_or_sections=coef.shape[len(coef.shape) - 1] // 2, dim=-1) + return residual * coef1 + output * coef2 + + +def mlp_gemm_fallback( + input, + residual, + input_bias, + weight_interm, + weight_out, + bias, + gamma, + beta, + eps, + pre_layer_norm, + mlp_after_attn, + interm_scale, + out_scale, + dtype, + mlp_act_func_type, + transpose, +): + if mlp_after_attn: + residual_add = F.layer_norm( + input + residual + input_bias, + (input.shape[2], ), + gamma, + beta, + eps, + ) + tmp = torch.matmul(residual_add, weight_interm.t() if transpose else weight_interm) + tmp = F.gelu(tmp + bias) + output = torch.matmul(tmp, weight_out.t() if transpose else weight_out) + return (output, residual_add) + else: + # TODO: SW-151870 implement mlp_gemm_fallback + raise NotImplementedError + + +def pre_rms_norm_fallback(vals, residual, gamma, epsilon): + residual = vals.to(torch.float32) + residual.to(torch.float32) + vals = residual + + variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) + vals = vals * torch.rsqrt(variance + epsilon) + + if gamma.dtype in [torch.float16, torch.bfloat16]: + vals = vals.to(gamma.dtype) + + return gamma * vals, residual.to(gamma.dtype) + + +def rms_mlp_gemm_fallback( + input, + residual, + weight_interm, + weight_out, + gamma, + eps, + interm_scale, + out_scale, + dtype, + mlp_act_func_type, + transpose, +): + inp_norm, residual = pre_rms_norm_fallback(input, residual, gamma, eps) + tmp = torch.matmul(inp_norm.view([-1, inp_norm.size(2)]), weight_interm.t() if transpose else weight_interm) + up_proj, gate_proj = tmp.chunk(2, dim=1) + + from deepspeed.utils.types import ActivationFuncType + if mlp_act_func_type == ActivationFuncType.GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.ReLU: + intermediate = F.relu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_SILU: + intermediate = F.silu(gate_proj) + else: + raise f"rms_mlp_gemm_fallback not implemented for activation type {mlp_act_func_type}" + + intermediate = intermediate * up_proj + + output = torch.matmul(intermediate, weight_out.t() if transpose else weight_out) + output = output.view([input.size(0), input.size(1), -1]) + + return [output, residual] + + +def rms_norm_fallback(vals, gamma, epsilon): + variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) + vals = vals * torch.rsqrt(variance + epsilon) + + if gamma.dtype in [torch.float16, torch.bfloat16]: + vals = vals.to(gamma.dtype) + + return gamma * vals + + +def res_add_bias_ref_gptj_fallback(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): + hidden_state += attn_output + (residual + final_bias) / mp_size + if add_attn_bias: + hidden_state += attn_bias / mp_size + return hidden_state + + +def residual_add_bias_fallback( + hidden_state, + residual, + attention_output, + attention_bias, + final_bias, + mp_size, + mlp_after_attn, + add_bias, + pre_layer_norm, +): + if mlp_after_attn: + if pre_layer_norm: + tmp = (residual.float() + 
attention_output.float() + attention_bias.float() + + final_bias.float()) / mp_size + hidden_state.float() + else: + tmp = residual.float() + hidden_state.float() + final_bias.float() + else: + tmp = res_add_bias_ref_gptj_fallback(hidden_state, residual, attention_output, attention_bias, final_bias, + add_bias, mp_size) + residual.copy_(tmp.to(hidden_state.dtype)) + return residual + + +def qkv_gemm_fallback(input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + if add_bias: + tmp += bias + output = [tmp, inp_norm] + return output + + +def rms_qkv_gemm_fallback(input, weight, q_scale, gamma, eps, q_int8, transpose): + inp_norm = rms_norm_fallback(input, gamma, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + output = [tmp, inp_norm] + return output + + +def softmax_fallback( + attn_scores, + attn_mask, + alibi, + triangular, + recompute, + local_attention, + window_size, + async_op, + layer_scale, + head_offset, + mp_size, +): + # get heads, algo from kernel code + len_ = len(attn_scores.size()) + heads = 1 + if len_ > 1: + heads = attn_scores.size()[1] + num_attention_heads_per_partition = heads // mp_size + + if alibi != None: + if len(alibi.shape) == 1: + alibi = None + else: + alibi = alibi[head_offset:head_offset + num_attention_heads_per_partition] + if attn_mask != None and len(attn_mask.shape) == 1: + attn_mask = None + input_dtype = attn_scores.dtype + attn_scores *= layer_scale + + if alibi is not None: + attn_scores += alibi + if attn_mask is not None: + attn_mask = attn_mask.to(torch.bool) + + # expand atten_mask from two dim into 4 dim, insert two dims in the middle + if len(attn_mask.shape) == 2: + # The above if statement was added because the mask was already 4D so this + # expansion should be avoided as it expands to 6D and crashes later (in bloom + # HE KI FB) + attn_mask = attn_mask[:, None, None, :] + attn_scores = torch.masked_fill(attn_scores, attn_mask, -10000.0) + if triangular: + tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) + attn_scores = torch.masked_fill(attn_scores, tri, minus_inf) + output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) + return output + + +def transform4d_0213(x, seq_length): + assert x.dim() == 3, F"{x.dim()=} is not supported" + batch_size, num_heads, seq_length_head_dim = x.shape + head_dim = seq_length_head_dim // seq_length + x = x.view(batch_size, num_heads, seq_length, head_dim) + x = x.permute(0, 2, 1, 3) + return x + + +def bias_add_transform_0213(input, bias, num_heads, trans_count, perform_bias=False): + assert trans_count == 1 or trans_count == 3, F"{trans_count=} is not supported" + assert input.dim() == 3, F"{input.dim()=} is not supported" + input_biased = bias_add(input, bias) if perform_bias else input + batch_size, seq_length, value_size = input_biased.shape + hid_dim = value_size // trans_count + head_dim = hid_dim // num_heads + + if (trans_count == 1): + query_layer = input.view(batch_size, seq_length, num_heads, head_dim) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = torch.zeros_like(query_layer) + value_layer = torch.zeros_like(query_layer) + return query_layer, key_layer, value_layer + + qkv_layers = input.view(batch_size, seq_length, 3, num_heads, head_dim) + query_layer, key_layer, value_layer = qkv_layers[..., 0, :, :], qkv_layers[..., 1, :, :], 
qkv_layers[..., 2, :, :] + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + value_layer = value_layer.transpose(1, 2) + return query_layer, key_layer, value_layer + + +def vector_matmul_fallback(input, weight, async_op, q_scale, q_int8, transpose): + return torch.matmul(input, weight.t() if transpose else weight) + + +def vector_add_fallback(a, b, gamma): + """Based on csrc/transformer/inference/csrc/pt_binding.cpp code of _vector_add""" + dtype = a.dtype + return (gamma * a.float() + b.float()).to(dtype) + + +def reset_cache(): + return InferenceContext.Instance().reset_tokens() + + +def release_workspace(): + return InferenceContext.Instance().release_workspace() + + +def retake_workspace(): + return InferenceContext.Instance().retake_workspace() + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep <= 1 or num_key_value_heads == 1: + return hidden_states + + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def softmax_context_fallback(query_key_value, attn_mask, rotary_dim, rotate_half, roteate_every_two, heads, num_kv, + norm_factor, triangular_masking, local_attention, window_size, no_masking, layer_id, + num_layers, alibi, rope_theta, is_prompt, token_idx, position_ids): + + bat_0213_query, bat_0213_key, bat_0213_value = bias_add_transform_0213(query_key_value, None, heads, 3, False) + + if (rotary_dim > 0 and rotate_half): + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb + + rotary = InferenceContext.Instance().get_rotary(rotary_dim, rope_theta) + cos, sin = rotary(bat_0213_value, InferenceContext.Instance().get_max_tokens_num()) + # TODO: SW-170999 Optimize RoPE implementation. 
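+ # Note: get_rotary() caches a single LlamaRotaryEmbedding on the InferenceContext;
+ # cos/sin are generated for up to max_out_tokens positions, and apply_rotary_pos_emb
+ # selects the per-token rotation angles via position_ids (transformers 4.35-style API).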
+ bat_0213_query, bat_0213_key = apply_rotary_pos_emb(bat_0213_query, bat_0213_key, cos, sin, position_ids) + + bat_0213_key, bat_0213_value = InferenceContext.Instance().update_cache(layer_id, token_idx, is_prompt, + bat_0213_key, bat_0213_value) + + bat_0213_key = repeat_kv(bat_0213_key, num_kv) + bat_0213_value = repeat_kv(bat_0213_value, num_kv) + + bsz = query_key_value.shape[0] + head_dim = query_key_value.shape[2] // (heads * 3) + + bmm_output = torch.bmm(bat_0213_query.reshape(bsz * heads, bat_0213_query.shape[2], head_dim), + bat_0213_key.reshape(bsz * heads, bat_0213_key.shape[2], head_dim).transpose(1, 2)) + + layer_scale = 1.0 + if alibi != None and len(alibi.shape) > 1: + layer_scale = max(1, layer_id).to(float) + + alpha = norm_factor * norm_factor / layer_scale + bmm_output *= alpha + bmm_output_reshape = bmm_output.reshape(bsz, heads, bmm_output.shape[1], bmm_output.shape[2]) + + recompute = is_prompt + if attn_mask is not None and len(attn_mask.shape) > 1 and attn_mask.shape[-1] < bmm_output_reshape.shape[3]: + attn_mask = torch.nn.functional.pad(attn_mask, (0, bmm_output_reshape.shape[3] - attn_mask.shape[-1]), + value=torch.finfo(attn_mask.dtype).min) + softmax_output = softmax_fallback(bmm_output_reshape, attn_mask, alibi, triangular_masking, recompute, + local_attention, window_size, None, layer_scale, 0, 1) + + output = torch.bmm(softmax_output.reshape(bsz * heads, softmax_output.shape[2], softmax_output.shape[3]), + bat_0213_value.reshape(bsz * heads, bat_0213_value.shape[2], head_dim)) + + output = output.reshape(bsz, heads, output.shape[1], head_dim) + output = output.reshape(bsz, heads, output.shape[2] * head_dim) + input_seq_len = query_key_value.shape[1] + t4d_0123_output = transform4d_0213(output, input_seq_len) + t4d_0123_output = t4d_0123_output.reshape(bsz, t4d_0123_output.shape[1], heads * head_dim) + + if layer_id == num_layers - 1: + InferenceContext.Instance().advance_tokens() + + return t4d_0123_output, bat_0213_key, bat_0213_value diff --git a/pre-commit-toggle.sh b/pre-commit-toggle.sh new file mode 100755 index 000000000000..c458c7f2d0da --- /dev/null +++ b/pre-commit-toggle.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Path to the pre-commit configuration file +PRE_COMMIT_CONFIG=".pre-commit-config.yaml" + +# Install pre-commit +pip install -r ${DEEPSPEED_FORK_ROOT}/requirements/requirements-dev.txt + +# Enable pre-commit +function enable_pre_commit() { + # Install pre-commit hooks + pre-commit install +} + +# Disable pre-commit +function disable_pre_commit() { + # Install pre-commit hooks + pre-commit uninstall +} + +# Check if the pre-commit configuration file exists +if [ -f "$PRE_COMMIT_CONFIG" ]; then + echo "Pre-commit configuration file found: $PRE_COMMIT_CONFIG" +else + echo "Pre-commit configuration file not found: $PRE_COMMIT_CONFIG" + exit 1 +fi + +# Check the command-line argument to enable or disable pre-commit +if [ "$1" == "enable" ]; then + enable_pre_commit +elif [ "$1" == "disable" ]; then + disable_pre_commit +else + echo "Usage: ./pre-commit-toggle.sh [enable|disable]" + exit 1 +fi diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 078386c457bd..b5bb1f118921 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -5,6 +5,7 @@ deepspeed-kernels ; sys_platform == 'linux' docutils<0.18 future importlib-metadata>=4 +megatron-lm@git+https://github.com/NVIDIA/Megatron-LM.git@v2.2 mup pre-commit>=2.20.0 pytest diff --git a/requirements/requirements-inf.txt 
b/requirements/requirements-inf.txt index 848a7f7a485d..8c20f151c5d3 100644 --- a/requirements/requirements-inf.txt +++ b/requirements/requirements-inf.txt @@ -1,5 +1,6 @@ google -lm-eval==0.3.0 +lm-eval>=0.2.0 protobuf +safetensors transformers transformers[sentencepiece] diff --git a/requirements/requirements-sparse_attn.txt b/requirements/requirements-sparse_attn.txt index f929bb0168a5..09386fdcb120 100755 --- a/requirements/requirements-sparse_attn.txt +++ b/requirements/requirements-sparse_attn.txt @@ -1 +1 @@ -triton==1.0.0 +triton==2.0.0.dev20221202 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80c9f9b3287a..291b45f80923 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,7 +1,7 @@ hjson ninja numpy -packaging>=20.0 +packaging psutil py-cpuinfo pydantic diff --git a/requirements_hpu/requirements-1bit-mpi.txt b/requirements_hpu/requirements-1bit-mpi.txt new file mode 100644 index 000000000000..a17a8c62cb59 --- /dev/null +++ b/requirements_hpu/requirements-1bit-mpi.txt @@ -0,0 +1 @@ +mpi4py==3.1.4 diff --git a/requirements_hpu/requirements-autotuning-ml.txt b/requirements_hpu/requirements-autotuning-ml.txt new file mode 100644 index 000000000000..b0157e24f5c4 --- /dev/null +++ b/requirements_hpu/requirements-autotuning-ml.txt @@ -0,0 +1,3 @@ +hjson==3.1.0 +tabulate==0.9.0 +xgboost==2.0.1 diff --git a/requirements_hpu/requirements-autotuning.txt b/requirements_hpu/requirements-autotuning.txt new file mode 100755 index 000000000000..a00a1bbfbe41 --- /dev/null +++ b/requirements_hpu/requirements-autotuning.txt @@ -0,0 +1 @@ +tabulate==0.9.0 diff --git a/requirements_hpu/requirements-cpu.txt b/requirements_hpu/requirements-cpu.txt new file mode 100755 index 000000000000..08a7ec83faf7 --- /dev/null +++ b/requirements_hpu/requirements-cpu.txt @@ -0,0 +1 @@ +intel_extension_for_pytorch diff --git a/requirements_hpu/requirements-dev.txt b/requirements_hpu/requirements-dev.txt new file mode 100644 index 000000000000..e26e903742c4 --- /dev/null +++ b/requirements_hpu/requirements-dev.txt @@ -0,0 +1,20 @@ +accelerate +clang-format==16.0.2 +coverage +#deepspeed-kernels ; sys_platform == 'linux' +docutils==0.17.1 +future==0.18.3 +importlib-metadata==6.8.0 +mup +pre-commit==3.3.3 +pytest==7.1.2 +pytest-forked==1.6.0 +pytest-randomly==3.15.0 +pytest-xdist==3.3.1 +recommonmark==0.7.1 +sphinx==5.3.0 +sphinx-rtd-theme==1.3.0 +tensorboard==2.11.2 +torchvision +transformers==4.35.0 +wandb==0.15.12 diff --git a/requirements_hpu/requirements-inf.txt b/requirements_hpu/requirements-inf.txt new file mode 100644 index 000000000000..8a8f444f540a --- /dev/null +++ b/requirements_hpu/requirements-inf.txt @@ -0,0 +1,5 @@ +google==3.0.0 +lm-eval==0.3.0 +protobuf==3.20.3 +transformers==4.35.0 +transformers[sentencepiece] diff --git a/requirements_hpu/requirements-readthedocs.txt b/requirements_hpu/requirements-readthedocs.txt new file mode 100644 index 000000000000..f6375427f8a2 --- /dev/null +++ b/requirements_hpu/requirements-readthedocs.txt @@ -0,0 +1,11 @@ +autodoc_pydantic==1.9.0 +docutils==0.17.1 +hjson==3.1.0 +packaging==23.2 +psutil==5.9.6 +py-cpuinfo==9.0.0 +pydantic==1.10.13 +recommonmark==0.7.1 +sphinx_rtd_theme==1.3.0 +torch +tqdm==4.66.1 diff --git a/requirements_hpu/requirements-sd.txt b/requirements_hpu/requirements-sd.txt new file mode 100644 index 000000000000..4dd53cb8a1cf --- /dev/null +++ b/requirements_hpu/requirements-sd.txt @@ -0,0 +1,2 @@ +diffusers==0.21.4 +triton==2.0.0.dev20221202 diff --git 
a/requirements_hpu/requirements-sparse_attn.txt b/requirements_hpu/requirements-sparse_attn.txt new file mode 100755 index 000000000000..09386fdcb120 --- /dev/null +++ b/requirements_hpu/requirements-sparse_attn.txt @@ -0,0 +1 @@ +triton==2.0.0.dev20221202 diff --git a/requirements_hpu/requirements-sparse_pruning.txt b/requirements_hpu/requirements-sparse_pruning.txt new file mode 100755 index 000000000000..3b96b4134cdb --- /dev/null +++ b/requirements_hpu/requirements-sparse_pruning.txt @@ -0,0 +1 @@ +neural-compressor==2.1.0 diff --git a/requirements_hpu/requirements-triton.txt b/requirements_hpu/requirements-triton.txt new file mode 100644 index 000000000000..3b382f83f2ae --- /dev/null +++ b/requirements_hpu/requirements-triton.txt @@ -0,0 +1 @@ +triton==2.1.0 diff --git a/requirements_hpu/requirements.txt b/requirements_hpu/requirements.txt new file mode 100755 index 000000000000..c83ff5a88e0c --- /dev/null +++ b/requirements_hpu/requirements.txt @@ -0,0 +1,10 @@ +hjson==3.1.0 +ninja==1.11.1.1 +numpy==1.23.5 +packaging==23.2 +psutil==5.9.6 +py-cpuinfo==9.0.0 +pydantic==1.10.13 +pynvml==8.0.4 +torch +tqdm==4.66.1 diff --git a/test b/test new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/conftest.py b/tests/conftest.py index 45e8434a021b..f1451bfcf7b6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,11 @@ from os.path import abspath, dirname, join import torch import warnings +from unit.ci_promote_marker import * +from unit.xfail_marker import * +from unit.skip_marker import * +from unit.util import get_hpu_dev_version +from deepspeed.accelerator import get_accelerator # Set this environment variable for the T5 inference unittest(s) (e.g. google/t5-v1_1-small) os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python' @@ -21,11 +26,21 @@ sys.path.insert(1, git_repo_path) +def update_wa_env_var(key, value): + if key not in os.environ.keys(): + os.environ[key] = value + + def pytest_configure(config): config.option.color = "yes" config.option.durations = 0 config.option.durations_min = 1 config.option.verbose = True + if get_accelerator().device_name() == 'hpu': + # TODO: SW-113485 need to remove the below WA once SW-113485 is unblocked + update_wa_env_var("PT_HPU_LAZY_ACC_PAR_MODE", "0") + # todo SW-125782: remove DYNAMIC SHAPE disable WA + update_wa_env_var("PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES", "0") def pytest_addoption(parser): @@ -70,6 +85,49 @@ def pytest_runtest_call(item): item.runtest = lambda: True # Dummy function so test is not run twice +def pytest_collection_modifyitems(items, config): + device = get_accelerator().device_name() + gaudi_dev = get_hpu_dev_version() + for item in items: + if item._nodeid in hpu_ci_tests: + item._pyfuncitem.add_marker(pytest.mark.hpu_ci) + if item._nodeid in hpu_ci_tests_4cards: + item._pyfuncitem.add_marker(pytest.mark.hpu_ci_4cards) + if item._nodeid in gpu_ci_tests: + item._pyfuncitem.add_marker(pytest.mark.gpu_ci) + if item._nodeid in hpu_promote_tests: + item._pyfuncitem.add_marker(pytest.mark.hpu_promote) + if item._nodeid in hpu_promote_tests_4cards: + item._pyfuncitem.add_marker(pytest.mark.hpu_promote_4cards) + if item._nodeid in gpu_promote_tests: + item._pyfuncitem.add_marker(pytest.mark.gpu_promote) + for item in items: + item.user_properties.append(("module_name", item.module.__name__)) + if item._nodeid in hpu_xfail_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.xfail(device == 'hpu', reason=hpu_xfail_tests[item._nodeid])) + if item._nodeid in gpu_xfail_tests.keys(): + 
item._pyfuncitem.add_marker(pytest.mark.xfail(device != 'hpu', reason=gpu_xfail_tests[item._nodeid])) + if item._nodeid in hpu_skip_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.skipif(device == 'hpu', reason=hpu_skip_tests[item._nodeid])) + if item._nodeid in gpu_skip_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.skipif(device != 'hpu', reason=gpu_skip_tests[item._nodeid])) + if device == 'hpu': + if gaudi_dev == "Gaudi": + if item._nodeid in g1_xfail_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.xfail(reason=g1_xfail_tests[item._nodeid])) + if item._nodeid in g1_skip_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.skip(reason=g1_skip_tests[item._nodeid])) + if gaudi_dev == "Gaudi2": + if item._nodeid in g2_xfail_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.xfail(reason=g2_xfail_tests[item._nodeid])) + if item._nodeid in g2_skip_tests.keys(): + item._pyfuncitem.add_marker(pytest.mark.skip(reason=g2_skip_tests[item._nodeid])) + for marker in item.own_markers: + if marker.name in ['skip', 'xfail']: + if 'reason' in marker.kwargs: + item.user_properties.append(("message", marker.kwargs['reason'])) + + # We allow DistributedTest to reuse distributed environments. When the last # test for a class is run, we want to make sure those distributed environments # are destroyed. @@ -85,3 +143,11 @@ def pytest_fixture_setup(fixturedef, request): if getattr(fixturedef.func, "is_dist_fixture", False): dist_fixture_class = fixturedef.func() dist_fixture_class(request) + + +def pytest_runtest_makereport(item, call): + if call.when == 'call': + if call.excinfo: + if not (any('message' in prop for prop in item.user_properties)): + if call.excinfo.value: + item.user_properties.append(("message", call.excinfo.value)) diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index e3be2be4894d..0a074dbbb80d 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -99,12 +99,24 @@ def cifar_trainset(fp16=False): dist.barrier() if local_rank != 0: dist.barrier() - - data_root = os.getenv("TEST_DATA_DIR", "/tmp/") - trainset = torchvision.datasets.CIFAR10(root=os.path.join(data_root, "cifar10-data"), - train=True, - download=True, - transform=transform) + if os.getenv("CIFAR10_OFFLINE", default=None): + if os.getenv("CIFAR10_DATASET_PATH", default=None): + trainset = torchvision.datasets.CIFAR10(root=os.getenv("CIFAR10_DATASET_PATH", default=None), + train=True, + download=False, + transform=transform) + elif os.getenv("STORE_CIFAR10", default=None): + if os.getenv("CIFAR10_DATASET_PATH", default=None): + trainset = torchvision.datasets.CIFAR10(root=os.getenv("CIFAR10_DATASET_PATH", default=None), + train=True, + download=True, + transform=transform) + else: + data_root = os.getenv("TEST_DATA_DIR", "/tmp/") + trainset = torchvision.datasets.CIFAR10(root=os.path.join(data_root, "cifar10-data"), + train=True, + download=True, + transform=transform) if local_rank == 0: dist.barrier() return trainset diff --git a/tests/unit/checkpoint/common.py b/tests/unit/checkpoint/common.py index d6dda2f14cbe..3b8338824e94 100644 --- a/tests/unit/checkpoint/common.py +++ b/tests/unit/checkpoint/common.py @@ -136,6 +136,8 @@ def create_moe_param_groups(model): def create_deepspeed_model(config_dict, model, base_optimizer): + if get_accelerator().device_name() == 'hpu': + model.to(get_accelerator().device_name()) ds_model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=create_moe_param_groups(model), diff --git 
a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py index c4c6773cd474..f6b36f5a1486 100644 --- a/tests/unit/checkpoint/test_lr_scheduler.py +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -8,9 +8,7 @@ from unit.common import DistributedTest from unit.simple_model import * - from unit.checkpoint.common import checkpoint_correctness_verification - import pytest @@ -52,11 +50,18 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } } hidden_dim = 10 + fp16 = True + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + zero3_init_dtype = torch.float32 if zero_stage == 3: global DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -66,7 +71,8 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): hidden_dim, tmpdir, load_optimizer_states=False, - load_lr_scheduler_states=True) + load_lr_scheduler_states=True, + fp16=fp16) def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: @@ -99,8 +105,16 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } hidden_dim = 10 + fp16 = True + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + zero3_init_dtype = torch.float32 + if zero_stage == 3: - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -110,4 +124,5 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): hidden_dim, tmpdir, load_optimizer_states=False, - load_lr_scheduler_states=False) + load_lr_scheduler_states=False, + fp16=fp16) diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 3f853cd5c13a..636d750476ac 100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -12,7 +12,6 @@ from unit.common import DistributedTest from unit.simple_model import * from unit.checkpoint.common import * - import pytest if not required_torch_version(max_version=2.0): @@ -46,7 +45,9 @@ def _toy_model_config(self, shard_size): "mics_shard_size": shard_size } } - + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.MiCS_Init(config_dict_or_path=config_dict): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index 0706b7327ce8..36bdec542eb8 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -24,7 +24,11 @@ def test_checkpoint_moe(self, tmpdir, ep_size): config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 - + fp16 = 
config_dict["fp16"]["enabled"] + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] checkpoint_correctness_verification(config_dict, @@ -33,7 +37,7 @@ def test_checkpoint_moe(self, tmpdir, ep_size): tmpdir=tmpdir, load_optimizer_states=True, load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], + fp16=fp16, empty_tag=True, base_optimizers=optimizers, seq_dataloader=True) @@ -61,9 +65,15 @@ def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): }, "zero_optimization": { "stage": 2, + "reduce_scatter": True } } hidden_dim = 16 + fp16 = config_dict["fp16"]["enabled"] + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] # param group must have a random unique name (for now) @@ -77,7 +87,7 @@ def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): tmpdir=tmpdir, load_optimizer_states=load_optim_states, load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], + fp16=fp16, empty_tag=True, base_optimizers=optimizers, seq_dataloader=True) diff --git a/tests/unit/checkpoint/test_other_optimizer.py b/tests/unit/checkpoint/test_other_optimizer.py index 9cb8c4286880..e8f29992215c 100644 --- a/tests/unit/checkpoint/test_other_optimizer.py +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -8,9 +8,7 @@ from unit.common import DistributedTest from unit.simple_model import * - from unit.checkpoint.common import checkpoint_correctness_verification - import pytest @@ -49,7 +47,11 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): } } } - + fp16 = True + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @@ -59,14 +61,16 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - load_optimizer_states=True) + load_optimizer_states=True, + fp16=fp16) # Ignore optimizer states checkpoint_correctness_verification(config_dict, models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - load_optimizer_states=False) + load_optimizer_states=False, + fp16=fp16) def test_checkpoint_fused_optimizer(self, tmpdir): config_dict = { @@ -85,6 +89,11 @@ def test_checkpoint_fused_optimizer(self, tmpdir): "enabled": True } } + fp16 = True + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -95,14 +104,16 @@ def test_checkpoint_fused_optimizer(self, tmpdir): models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - load_optimizer_states=True) + load_optimizer_states=True, + fp16=fp16) # Ignore optimizer states checkpoint_correctness_verification(config_dict, models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - load_optimizer_states=False) + load_optimizer_states=False, + fp16=fp16) def test_checkpoint_fp32_optimizer(self, tmpdir): config_dict = { diff --git 
a/tests/unit/checkpoint/test_pipeline.py b/tests/unit/checkpoint/test_pipeline.py index 99f1ba2ec433..bfe53d82f591 100644 --- a/tests/unit/checkpoint/test_pipeline.py +++ b/tests/unit/checkpoint/test_pipeline.py @@ -8,7 +8,6 @@ from unit.simple_model import * from unit.checkpoint.common import checkpoint_correctness_verification from unit.util import skip_on_arch - import pytest @@ -52,13 +51,18 @@ def test_checkpoint_pipe_engine(self, zero_stage, tmpdir): } } } + fp16 = config_dict['fp16']['enabled'] + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False models = [LinearStackPipe(num_stages=2) for _ in range(2)] checkpoint_correctness_verification(config_dict=config_dict, models=models, hidden_dim=models[0].hidden_dim, tmpdir=tmpdir, - fp16=config_dict['fp16']['enabled'], + fp16=fp16, load_optimizer_states=True, load_lr_scheduler_states=True, train_batch=True) diff --git a/tests/unit/checkpoint/test_shared_weights.py b/tests/unit/checkpoint/test_shared_weights.py index ed69073fb81c..d86f6c07ab2f 100644 --- a/tests/unit/checkpoint/test_shared_weights.py +++ b/tests/unit/checkpoint/test_shared_weights.py @@ -9,6 +9,7 @@ import deepspeed from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint from unit.common import DistributedTest +from deepspeed.accelerator import get_accelerator class ModelWithSharedWeights(nn.Module): @@ -34,8 +35,10 @@ def test_checkpoint_shared_weights(self, tmp_path): }, } model = ModelWithSharedWeights() + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().current_device_name() + model.to(device) optimizer = torch.optim.Adam(model.parameters()) - deepspeed_engine, _, _, _ = deepspeed.initialize( config=config, model=model, diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index f2237341ef68..f44fa3b1a6c9 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -41,7 +41,6 @@ def test_pipeline_checkpoint_loading(self, tmpdir, zero_stage): with deepspeed.zero.Init(): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, 'Adam'), (2, False, 'Adam'), @@ -75,14 +74,29 @@ def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_op } } hidden_dim = 10 + fp16 = True + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + zero3_init_dtype = torch.float32 if zero_stage == 3: - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True) + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + fp16=fp16, + load_optimizer_states=True) @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, "Adam"), (2, False, "Adam"), (2, True, 
'deepspeed_adam'), @@ -113,16 +127,31 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, ada } } hidden_dim = 10 + fp16 = True + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + zero3_init_dtype = torch.float32 if zero_stage == 3: global DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False) + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + fp16=fp16, + load_optimizer_states=False) @pytest.mark.parametrize('zero_stage', [1, 2]) def test_hybrid_optimizer_state(self, tmpdir, zero_stage): @@ -140,7 +169,16 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): } } hidden_dim = 10 + fp16 = True + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] optimizers = [HybridStateOptimizer(model.parameters()) for model in models] checkpoint_correctness_verification(config_dict, @@ -148,6 +186,7 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): base_optimizers=optimizers, hidden_dim=hidden_dim, tmpdir=tmpdir, + fp16=fp16, load_optimizer_states=True) @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) @@ -166,14 +205,24 @@ def test_load_module_only(self, tmpdir, zero_stage): } } hidden_dim = 10 + fp16 = True + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + fp16 = False + zero3_init_dtype = torch.float32 if zero_stage == 3: - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, fp16=fp16, load_module_only=True) class ws4_model_checkpoint(DistributedFixture): @@ -197,8 +246,17 @@ def run(self, class_tmpdir, elastic_save, load_optim): hidden_dim = 10 model = SimpleModel(hidden_dim) + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["fp32"] = {"enabled": True} + dtype = torch.float model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + 
total_samples=8, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -237,10 +295,20 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l # Newer torch versions store tensor ids as 0, 1, 2, ... expected_mismatch_keys = [] if required_torch_version(min_version=1.4) else ['params'] models = [SimpleModel(hidden_dim) for _ in range(2)] + + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["fp32"] = {"enabled": True} + dtype = torch.float model, _, _, _ = deepspeed.initialize(config=ds_config, model=models[0], model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -261,7 +329,11 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): compare_state_dicts(curr_param_group, saved_param_group, expected_mismatch_keys) - data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -286,6 +358,9 @@ def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, hidden_dim = 10 model = SimpleModel(hidden_dim) + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["fp32"] = {"enabled": True} # Load checkpoint with dp world size = 2 model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) if load_optim: @@ -316,6 +391,9 @@ def test_immediate_save_load(self, tmpdir, zero_stage): hidden_dim = 10 model = SimpleModel(hidden_dim) + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) ds_model.save_checkpoint(tmpdir) ds_model.load_checkpoint(tmpdir, @@ -343,6 +421,10 @@ def test_load_immediate_save(self, tmpdir, zero_stage): # 1. pretrain a model and save it dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) data_loader = random_dataloader(model=ds_model, total_samples=1, @@ -386,6 +468,11 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): hidden_dim = 10 model = SimpleModel(hidden_dim) + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum # cycle was completed. 
# So we config grad_accum=2 and step only once and save_16bit_model @@ -395,7 +482,7 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): total_samples=2, hidden_dim=hidden_dim, device=ds_model.device, - dtype=torch.half) + dtype=dtype) batch = next(iter(data_loader)) loss = ds_model(batch[0], batch[1]) @@ -438,12 +525,28 @@ def test_load_optimizer_state(self, tmpdir, zero_stage): "stage": zero_stage } } + fp16 = True + dtype = None + if os.getenv("REPLACE_FP16", default=None): + fp16 = False + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float32 + hidden_dim = 10 - with deepspeed.zero.Init(enabled=zero_stage == 3): + with deepspeed.zero.Init(enabled=zero_stage == 3, dtype=dtype): models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True) + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=True, + fp16=fp16) @pytest.mark.parametrize('zero_stage', [1, 2, 3]) def test_not_load_optimizer_state(self, tmpdir, zero_stage): @@ -468,11 +571,25 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage): } } hidden_dim = 10 - - with deepspeed.zero.Init(enabled=zero_stage == 3): + fp16 = True + dtype = None + if os.getenv("REPLACE_FP16", default=None): + fp16 = False + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float32 + with deepspeed.zero.Init(enabled=zero_stage == 3, dtype=dtype): models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False) + checkpoint_correctness_verification(config_dict, + models, + hidden_dim, + tmpdir, + load_optimizer_states=False, + fp16=fp16) @pytest.mark.parametrize('zero_stage', [1, 2, 3]) def test_load_module_only(self, tmpdir, zero_stage): @@ -490,11 +607,20 @@ def test_load_module_only(self, tmpdir, zero_stage): } } hidden_dim = 10 - - with deepspeed.zero.Init(enabled=zero_stage == 3): + fp16 = True + dtype = None + if os.getenv("REPLACE_FP16", default=None): + fp16 = False + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float32 + with deepspeed.zero.Init(enabled=zero_stage == 3, dtype=dtype): models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + models = [model.to(device) for model in models] - checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True, fp16=fp16) @pytest.mark.parametrize('zero_stage', [1, 2]) def test_save_exclude_frozen_weights(self, tmpdir, zero_stage): @@ -561,6 +687,11 @@ def test_save_exclude_custom_frozen_weights(self, tmpdir, zero_stage): } } hidden_dim = 10 + if os.getenv("REPLACE_FP16", default=None): + fp16 = False + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.float32 model = 
SimpleFrozenModel(hidden_dim, empty_grad=False) @@ -603,8 +734,12 @@ def test_save_tensor_clone(self, tmpdir, zero_stage, use_cpu_device): "train_batch_size": 1, "train_micro_batch_size_per_gpu": 1 } + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 + ds_config["bf16"] = {"enabled": True} hidden_dim = 1024 - model = SimpleModel(hidden_dim, nlayers=4).half() + model = SimpleModel(hidden_dim, nlayers=4).to(dtype=dtype) ref_model_state_dict = model.state_dict() ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config) diff --git a/tests/unit/ci_promote_marker.py b/tests/unit/ci_promote_marker.py new file mode 100644 index 000000000000..25e411ec9f4b --- /dev/null +++ b/tests/unit/ci_promote_marker.py @@ -0,0 +1,639 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +hpu_ci_tests = [ + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_train_schedule_singlestage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_schedule_firststage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[1]", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_enable_argument", + "unit/runtime/test_ds_config_model.py::test_config_base", + "unit/comm/test_dist.py::TestWorldSizeOverrideDistTest::test_world_size_1", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_211", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_122", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-uncased-fill-mask-bf16-CG]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[2-2]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[33-33]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[0-0]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[1-1]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[32-32]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[5-5]", + "unit/runtime/half_precision/test_bf16.py::TestZeroAllowUntestedOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestInitNoOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestDistInit::test", "unit/launcher/test_run.py::test_parser_local", + "unit/launcher/test_run.py::test_parser_mutual_exclusive", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupLR-params0]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[OneCycle-params2]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[LRRangeTest-params3]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupDecayLR-params1]", + 
"unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0.1-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-211]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.1-0-10-0]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-210]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-101]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-101]", + "unit/runtime/utils/test_partition.py::test_float_balanced", + "unit/runtime/utils/test_partition.py::test_int_balanced", + "unit/runtime/utils/test_partition.py::test_easy_balance_uniform", + "unit/runtime/utils/test_partition.py::test_float_midheavy", + "unit/runtime/utils/test_partition.py::test_short_partition_uniform", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings2]", + "unit/autotuning/test_autotuning.py::test_command_line", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings4]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings3]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[None]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings1]", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/runtime/test_pld.py::TestNonPLDModel::test_non_pld_model", + "unit/runtime/zero/test_zero_config.py::test_zero_config_deprecatedfields", + "unit/runtime/zero/test_zero_config.py::test_zero_config_aliasfields", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output3]", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Optimizer]", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[optimizer_type2]", + "unit/elasticity/test_elastic.py::test_proper_mbsz", "unit/runtime/pipe/test_topology.py::test_topology_rank_repr", + "unit/runtime/pipe/test_topology.py::test_topology_2d", "unit/runtime/pipe/test_topology.py::test_primes", + "unit/runtime/sparse_tensor/test_csr.py::test_csr_addition_different", + "unit/utils/test_get_optim_files.py::test_get_optim_files[2]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[12]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[24]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[1]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-uncased-fill-mask-bf16-CG]", + 
"unit/inference/test_inference.py::TestModelTaskKIFalse::test[gpt2-text-generation-fp16-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[gpt2-text-generation-fp16-noCG]" +] + +hpu_ci_tests_4cards = [ + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_train_schedule_singlestage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_schedule_firststage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[1]", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_enable_argument", + "unit/runtime/test_ds_config_model.py::test_config_base", + "unit/comm/test_dist.py::TestWorldSizeOverrideDistTest::test_world_size_1", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_211", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_122", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-uncased-fill-mask-bf16-CG]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[2-2]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[33-33]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[0-0]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[1-1]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[32-32]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[5-5]", + "unit/runtime/half_precision/test_bf16.py::TestZeroAllowUntestedOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestInitNoOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestDistInit::test", "unit/launcher/test_run.py::test_parser_local", + "unit/launcher/test_run.py::test_parser_mutual_exclusive", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupLR-params0]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[OneCycle-params2]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[LRRangeTest-params3]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupDecayLR-params1]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0.1-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-211]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.1-0-10-0]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-210]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-101]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-101]", + 
"unit/runtime/utils/test_partition.py::test_float_balanced", + "unit/runtime/utils/test_partition.py::test_int_balanced", + "unit/runtime/utils/test_partition.py::test_easy_balance_uniform", + "unit/runtime/utils/test_partition.py::test_float_midheavy", + "unit/runtime/utils/test_partition.py::test_short_partition_uniform", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings2]", + "unit/autotuning/test_autotuning.py::test_command_line", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings4]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings3]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[None]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings1]", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/runtime/test_pld.py::TestNonPLDModel::test_non_pld_model", + "unit/runtime/zero/test_zero_config.py::test_zero_config_deprecatedfields", + "unit/runtime/zero/test_zero_config.py::test_zero_config_aliasfields", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output3]", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Optimizer]", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[optimizer_type2]", + "unit/elasticity/test_elastic.py::test_proper_mbsz", "unit/runtime/pipe/test_topology.py::test_topology_rank_repr", + "unit/runtime/pipe/test_topology.py::test_topology_2d", "unit/runtime/pipe/test_topology.py::test_primes", + "unit/runtime/sparse_tensor/test_csr.py::test_csr_addition_different", + "unit/utils/test_get_optim_files.py::test_get_optim_files[2]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[12]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[24]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[1]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-base-uncased-fill-mask-bf16-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[gpt2-text-generation-fp16-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[gpt2-text-generation-fp16-noCG]", + "unit/comm/test_dist.py::TestDistInitNoEnv::test", + "unit/runtime/zero/test_zero_context.py::TestSerialContext::test_throughput_calculation", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero2]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero1]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero1]", + "unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[2]", + "unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[3]", + 
"unit/runtime/zero/test_zero_context.py::TestGatherUpdate::test", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[1]", + "unit/runtime/zero/test_zero_context.py::TestScatterGather::test", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[2]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-False-Adam]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-True-True]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[2]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-False]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[1-False]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_save_before_accum_grad_is_done[3]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[3]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[1-False]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_not_load_optimizer_state[2]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_load_module_only[3]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_not_load_optimizer_state[3]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[2-True-deepspeed_adam]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_use_reentrant[topo_config0]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-True]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[none-EleutherAI/gpt-neo-125m-zero_stage=2-bsz=1]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_base[topo_config1]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-True-deepspeed_adam]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_use_reentrant[topo_config2]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[cpu-EleutherAI/gpt-neo-125m-zero_stage=3-bsz=1]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_use_reentrant[topo_config1]", + "unit/runtime/half_precision/test_fp16.py::TestFP16OptimizerForMoE::test_unfused_gradnorm", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[none-facebook/opt-350m-zero_stage=3-bsz=1]", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShard::test[facebook/opt-350m-fp16]" +] + +hpu_promote_tests = [ + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-False-resulting_optimizer9]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-False-resulting_optimizer3]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-True-resulting_optimizer13]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-False-resulting_optimizer1]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-True-resulting_optimizer7]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-True-resulting_optimizer5]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-True-resulting_optimizer15]", + 
"unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-False-resulting_optimizer11]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-False-resulting_optimizer2]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-False-resulting_optimizer8]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-True-resulting_optimizer14]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-True-resulting_optimizer12]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-True-resulting_optimizer4]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-True-resulting_optimizer6]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-False-resulting_optimizer10]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[Adam]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-3]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-3]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[2]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[1]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[3]", + "unit/compression/test_compression.py::TestCompression::test_mpu_compress", + "unit/launcher/test_run.py::test_parser_errors", "unit/launcher/test_run.py::test_num_plus_parser", + "unit/launcher/test_run.py::test_parser_multinode", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-15]", + 
"unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-15]", + "unit/runtime/pipe/test_topology.py::test_topology_3d", + "unit/runtime/pipe/test_topology.py::test_topology_comm_list", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bfloat16]", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bf16]", + "unit/runtime/test_ds_config_dict.py::TestNoModel::test", + "unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask1]", + "unit/launcher/test_ds_arguments.py::test_core_deepscale_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments_no_ds_parser", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-scheduler_type2]", + "unit/runtime/utils/test_partition.py::test_balance_bert", + "unit/runtime/zero/test_zero_config.py::test_zero_config_offload_configs", + "unit/runtime/zero/test_zero_config.py::test_zero_offload_optimizer_config_pipeline", + "unit/runtime/test_pld.py::test_pld_schedule[0]", "unit/runtime/test_pld.py::test_pld_schedule[0.9]", + "unit/runtime/test_pld.py::test_pld_schedule[1.0]", "unit/runtime/test_pld.py::test_pld_schedule[0.1]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[1.0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.9]", + 
"unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.1]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources3]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources1]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources2]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources0]", + "unit/elasticity/test_elastic.py::test_basic_10k", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_111", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_121", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict0]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict1]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict2]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[1]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[1]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[3]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[facebook/opt-125m-text-generation-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[dslim/bert-base-NER-token-classification-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-CG]", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[True-False-roberta-base-fill-mask]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]", + "unit/runtime/half_precision/test_bf16.py::TestZeroEmptyGrad::test", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-base-fill-mask-fp16-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]" +] + +hpu_promote_tests_4cards = [ + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-False-resulting_optimizer9]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-False-resulting_optimizer3]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-True-resulting_optimizer13]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-False-resulting_optimizer1]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-True-resulting_optimizer7]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-True-resulting_optimizer5]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-True-resulting_optimizer15]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-False-resulting_optimizer11]", + 
"unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-False-resulting_optimizer2]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-False-resulting_optimizer8]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-True-resulting_optimizer14]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-True-resulting_optimizer12]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-True-resulting_optimizer4]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-True-resulting_optimizer6]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-False-resulting_optimizer10]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[Adam]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-3]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-3]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[2]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[1]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[3]", + "unit/compression/test_compression.py::TestCompression::test_mpu_compress", + "unit/launcher/test_run.py::test_parser_errors", "unit/launcher/test_run.py::test_num_plus_parser", + "unit/launcher/test_run.py::test_parser_multinode", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-15]", + 
"unit/runtime/pipe/test_topology.py::test_topology_3d", + "unit/runtime/pipe/test_topology.py::test_topology_comm_list", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bfloat16]", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bf16]", + "unit/runtime/test_ds_config_dict.py::TestNoModel::test", + "unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask1]", + "unit/launcher/test_ds_arguments.py::test_core_deepscale_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments_no_ds_parser", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-scheduler_type2]", + "unit/runtime/utils/test_partition.py::test_balance_bert", + "unit/runtime/zero/test_zero_config.py::test_zero_config_offload_configs", + "unit/runtime/zero/test_zero_config.py::test_zero_offload_optimizer_config_pipeline", + "unit/runtime/test_pld.py::test_pld_schedule[0]", "unit/runtime/test_pld.py::test_pld_schedule[0.9]", + "unit/runtime/test_pld.py::test_pld_schedule[1.0]", "unit/runtime/test_pld.py::test_pld_schedule[0.1]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[1.0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.9]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.1]", + 
"unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources3]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources1]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources2]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources0]", + "unit/elasticity/test_elastic.py::test_basic_10k", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_111", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_121", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict0]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict1]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict2]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[1]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[1]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[3]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[facebook/opt-125m-text-generation-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[dslim/bert-base-NER-token-classification-fp32-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-CG]", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[True-False-roberta-base-fill-mask]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]", + "unit/runtime/half_precision/test_bf16.py::TestZeroEmptyGrad::test", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[roberta-base-fill-mask-fp16-CG]", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero2]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero1]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero2]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero1]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero2]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-None]", + "unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[1]", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-None]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[1]", + 
"unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[2]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_load_optimizer_state[1]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[1-False-Adam]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[1]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[3]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-False]", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[1-False-Adam]", + "unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-True]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_base[topo_config0]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[none-EleutherAI/gpt-neo-125m-zero_stage=3-bsz=1]", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_base[topo_config2]", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShard::test[bigscience/bloom-560m-fp16]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[none-bigscience/bloom-560m-zero_stage=3-bsz=1]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[cpu-bigscience/bloom-560m-zero_stage=3-bsz=1]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[none-bigscience/bloom-560m-zero_stage=2-bsz=1]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[cpu-bigscience/bloom-560m-zero_stage=2-bsz=1]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[cpu-facebook/opt-350m-zero_stage=2-bsz=1]", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShard::test[facebook/opt-125m-fp16]", + "unit/hybrid_engine/test_he_lora.py::TestHybridEngineLoRA::test_lora[cpu-EleutherAI/gpt-neo-125m-zero_stage=2-bsz=1]", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShard::test[EleutherAI/gpt-neo-125M-fp16]" +] + +gpu_ci_tests = [ + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_train_schedule_singlestage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_schedule_firststage", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage[1]", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_enable_argument", + "unit/runtime/test_ds_config_model.py::test_config_base", + "unit/comm/test_dist.py::TestWorldSizeOverrideDistTest::test_world_size_1", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_211", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_122", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + 
"unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-CG]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[2-2]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[33-33]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[0-0]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[1-1]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[32-32]", + "unit/runtime/zero/test_zero_tiled.py::test_tiled_init[5-5]", + "unit/runtime/half_precision/test_bf16.py::TestZeroAllowUntestedOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestInitNoOptimizer::test", + "unit/runtime/test_ds_config_dict.py::TestDistInit::test", "unit/launcher/test_run.py::test_parser_local", + "unit/launcher/test_run.py::test_parser_mutual_exclusive", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupLR-params0]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[OneCycle-params2]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[LRRangeTest-params3]", + "unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupDecayLR-params1]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0.1-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-211]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.1-0-10-0]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0-21-21]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-210]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-101]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-100]", + "unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-101]", + "unit/runtime/utils/test_partition.py::test_float_balanced", + "unit/runtime/utils/test_partition.py::test_int_balanced", + "unit/runtime/utils/test_partition.py::test_easy_balance_uniform", + "unit/runtime/utils/test_partition.py::test_float_midheavy", + "unit/runtime/utils/test_partition.py::test_short_partition_uniform", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings2]", + "unit/autotuning/test_autotuning.py::test_command_line", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings4]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings3]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[None]", + "unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings1]", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/runtime/test_pld.py::TestNonPLDModel::test_non_pld_model", + "unit/runtime/zero/test_zero_config.py::test_zero_config_deprecatedfields", + "unit/runtime/zero/test_zero_config.py::test_zero_config_aliasfields", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[None]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output3]", + 
"unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Optimizer]", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[optimizer_type2]", + "unit/elasticity/test_elastic.py::test_proper_mbsz", "unit/runtime/pipe/test_topology.py::test_topology_rank_repr", + "unit/runtime/pipe/test_topology.py::test_topology_2d", "unit/runtime/pipe/test_topology.py::test_primes", + "unit/runtime/sparse_tensor/test_csr.py::test_csr_addition_different", + "unit/utils/test_get_optim_files.py::test_get_optim_files[2]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[12]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[24]", + "unit/utils/test_get_optim_files.py::test_get_optim_files[1]", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-CG]", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-CG]", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-fp16-CG]", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-fp16-noCG]" +] + +gpu_promote_tests = [ + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-False-resulting_optimizer9]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-False-resulting_optimizer3]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-True-True-resulting_optimizer13]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-False-resulting_optimizer1]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-True-True-resulting_optimizer7]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-True-True-resulting_optimizer5]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-True-resulting_optimizer15]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-True-False-resulting_optimizer11]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-False-resulting_optimizer2]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-False-resulting_optimizer8]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-True-resulting_optimizer14]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-True-resulting_optimizer12]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-True-resulting_optimizer4]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-True-resulting_optimizer6]", + "unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-False-resulting_optimizer10]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[Adam]", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-1]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-3]", + 
"unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-2]", + "unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-3]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[2]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[1]", + "unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[3]", + "unit/compression/test_compression.py::TestCompression::test_mpu_compress", + "unit/launcher/test_run.py::test_parser_errors", "unit/launcher/test_run.py::test_num_plus_parser", + "unit/launcher/test_run.py::test_parser_multinode", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-19]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-33]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-10]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-15]", + "unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-15]", + "unit/runtime/pipe/test_topology.py::test_topology_3d", + "unit/runtime/pipe/test_topology.py::test_topology_comm_list", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bfloat16]", + "unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bf16]", + "unit/runtime/test_ds_config_dict.py::TestNoModel::test", + "unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask1]", + 
"unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask0]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask1]", + "unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask1]", + "unit/launcher/test_ds_arguments.py::test_core_deepscale_arguments", + "unit/launcher/test_ds_arguments.py::test_no_ds_arguments_no_ds_parser", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest", + "unit/compression/test_compression.py::TestCompression::test_conv1d_convertion", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[optimizer_type2-scheduler_type2]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-None]", + "unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-scheduler_type2]", + "unit/runtime/utils/test_partition.py::test_balance_bert", + "unit/runtime/zero/test_zero_config.py::test_zero_config_offload_configs", + "unit/runtime/zero/test_zero_config.py::test_zero_offload_optimizer_config_pipeline", + "unit/runtime/test_pld.py::test_pld_schedule[0]", "unit/runtime/test_pld.py::test_pld_schedule[0.9]", + "unit/runtime/test_pld.py::test_pld_schedule[1.0]", "unit/runtime/test_pld.py::test_pld_schedule[0.1]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[1.0]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.9]", + "unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.1]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources3]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources1]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources2]", + "unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources0]", + "unit/elasticity/test_elastic.py::test_basic_10k", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_111", + "unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_121", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict0]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict1]", + "unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict2]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[3]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[10]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[1]", + 
"unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage[8]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[1]", + "unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage[3]", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-fp32-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp32-CG]", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-noCG]", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-CG]", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[True-False-roberta-base-fill-mask]", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]", + "unit/runtime/half_precision/test_bf16.py::TestZeroEmptyGrad::test", + "unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp16-CG]", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-CG]" +] diff --git a/tests/unit/common.py b/tests/unit/common.py index cdeca54b01ee..6c3642101c5e 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -23,7 +23,7 @@ from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker # Worker timeout for tests that hang -DEEPSPEED_TEST_TIMEOUT = 600 +DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DEEPSPEED_TEST_TIMEOUT', '600')) def is_rocm_pytorch(): @@ -59,6 +59,7 @@ def get_master_port(base_port=29500, port_range_size=1000): def set_accelerator_visible(): + # below function relevant for GPU cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) xdist_worker_id = get_xdist_worker_id() if xdist_worker_id is None: @@ -84,13 +85,21 @@ def set_accelerator_visible(): elif get_accelerator().device_name() == 'npu': npu_smi = subprocess.check_output(['npu-smi', 'info', '-l']) num_accelerators = int(npu_smi.decode('utf-8').strip().split('\n')[0].split(':')[1].strip()) + elif get_accelerator().device_name() == 'hpu': + hl_smi = subprocess.check_output(['hl-smi', "-L"]) + num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode()) + num_accelerators = sorted(num_accelerators, key=int) + os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators) else: assert get_accelerator().device_name() == 'cpu' cpu_sockets = int( subprocess.check_output('cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', shell=True)) num_accelerators = cpu_sockets - cuda_visible = ",".join(map(str, range(num_accelerators))) + if isinstance(num_accelerators, list): + cuda_visible = ",".join(num_accelerators) + else: + cuda_visible = ",".join(map(str, range(num_accelerators))) # rotate list based on xdist worker id, example below # wid=0 -> ['0', '1', '2', '3'] @@ -113,6 +122,7 @@ class DistributedExec(ABC): set_dist_env = True requires_cuda_env = True reuse_dist_env = False + non_daemonic_procs = False _pool_cache = {} exec_timeout = DEEPSPEED_TEST_TIMEOUT @@ -145,16 +155,11 @@ def _get_fixture_kwargs(self, request, func): pass # test methods can have kwargs that are not fixtures return fixture_kwargs - def _launch_procs(self, num_procs): - # Verify we have enough accelerator devices to run this test - if 
get_accelerator().is_available() and get_accelerator().device_count() < num_procs: - pytest.skip( - f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" - ) - - # Set start method to `forkserver` (or `fork`) - mp.set_start_method('forkserver', force=True) - + def _launch_daemonic_procs(self, num_procs): + if get_accelerator().device_name() == 'hpu': + if self.reuse_dist_env: + print("Ignoring reuse_dist_env for hpu") + self.reuse_dist_env = False # Create process pool or use cached one master_port = None if self.reuse_dist_env: @@ -177,17 +182,79 @@ def _launch_procs(self, num_procs): # usually means an environment error and the rest of tests will # hang (causing super long unit test runtimes) pytest.exit("Test hanged, exiting", returncode=0) - - # Tear down distributed environment and close process pools - self._close_pool(pool, num_procs) + finally: + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) # If we skipped a test, propagate that to this process if any(skip_msgs): assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" pytest.skip(skip_msgs[0]) - def _dist_run(self, local_rank, num_procs, master_port): - skip_msg = '' + def _launch_non_daemonic_procs(self, num_procs): + assert not self.reuse_dist_env, "Cannot reuse distributed environment with non-daemonic processes" + + master_port = get_master_port() + skip_msg = mp.Queue() # Allows forked processes to share pytest.skip reason + processes = [] + for local_rank in range(num_procs): + p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, skip_msg)) + p.start() + processes.append(p) + + # Now loop and wait for a test to complete. The spin-wait here isn't a big + # deal because the number of processes will be O(#GPUs) << O(#CPUs). + any_done = False + start = time.time() + while (not any_done) and ((time.time() - start) < self.exec_timeout): + for p in processes: + if not p.is_alive(): + any_done = True + break + time.sleep(.1) # So we don't hog CPU + + # If we hit the timeout, then presume a test is hanged + if not any_done: + for p in processes: + p.terminate() + pytest.exit("Test hanged, exiting", returncode=0) + + # Wait for all other processes to complete + for p in processes: + p.join(self.exec_timeout) + + failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] + for rank, p in failed: + # If it still hasn't terminated, kill it because it hung. 
+ if p.exitcode is None: + p.terminate() + pytest.fail(f'Worker {rank} hung.', pytrace=False) + if p.exitcode < 0: + pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False) + if p.exitcode > 0: + pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False) + + if not skip_msg.empty(): + # This assumed all skip messages are the same, it may be useful to + # add a check here to assert all exit messages are equal + pytest.skip(skip_msg.get()) + + def _launch_procs(self, num_procs): + # Verify we have enough accelerator devices to run this test + if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: + pytest.skip( + f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" + ) + + # Set start method to `forkserver` (or `fork`) + mp.set_start_method('forkserver', force=True) + + if self.non_daemonic_procs: + self._launch_non_daemonic_procs(num_procs) + else: + self._launch_daemonic_procs(num_procs) + + def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): if not dist.is_initialized(): """ Initialize deepspeed.comm and execute the user function. """ if self.set_dist_env: @@ -218,7 +285,10 @@ def _dist_run(self, local_rank, num_procs, master_port): self.run(**self._fixture_kwargs) except BaseException as e: if isinstance(e, Skipped): - skip_msg = e.msg + if self.non_daemonic_procs: + skip_msg.put(e.msg) + else: + skip_msg = e.msg else: raise e @@ -227,6 +297,7 @@ def _dist_run(self, local_rank, num_procs, master_port): def _dist_destroy(self): if (dist is not None) and dist.is_initialized(): dist.barrier() + # tear down after test completes dist.destroy_process_group() def _close_pool(self, pool, num_procs, force=False): diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py index a49ec595a420..dbd4f40c9b95 100644 --- a/tests/unit/elasticity/test_elastic.py +++ b/tests/unit/elasticity/test_elastic.py @@ -9,7 +9,7 @@ from deepspeed.git_version_info import version as ds_version import os from unit.simple_model import SimpleModel -from deepspeed.ops.op_builder import FusedAdamBuilder +from deepspeed.ops.op_builder import FusedAdamBuilder, FusedLambBuilder if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -150,6 +150,7 @@ def test_proper_mbsz(ds_config): class TestNonElasticBatchParams(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test(self): config_dict = { "train_batch_size": 2, @@ -182,6 +183,7 @@ def test(self): class TestNonElasticBatchParamsWithOverride(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test(self): config_dict = { "train_batch_size": 2, @@ -213,6 +215,7 @@ def test(self): class TestElasticConfigChanged(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test(self): config_dict = { "train_batch_size": 2, diff --git a/tests/unit/hybrid_engine/test_he_all.py b/tests/unit/hybrid_engine/test_he_all.py index aa1f120645b1..47af6990d4bb 100644 --- a/tests/unit/hybrid_engine/test_he_all.py +++ b/tests/unit/hybrid_engine/test_he_all.py @@ -43,8 +43,12 @@ def get_model(self, model_name): model_config 
= AutoConfig.from_pretrained(model_name) model_config.dropout = 0.0 model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) - model = model.half() - model = model.to(f'{get_accelerator().device_name()}:{local_rank}') + dev = get_accelerator().device_name() + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 + model = model.to(dtype=dtype) + model = model.to(f'{dev}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -70,8 +74,11 @@ def test_correctness(self, batch_size, model_name): base_out = self._generate(model, tokenizer, prompt) ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}} - model, *_ = deepspeed.initialize(model=model, config=ds_config) + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} + model, *_ = deepspeed.initialize(model=model, config=ds_config) model.eval() ds1_out = self._generate(model, tokenizer, prompt) assert base_out == ds1_out, f"base_out: {base_out}, ds1_out: {ds1_out}" @@ -87,6 +94,9 @@ def test_functionality(self, batch_size, model_name): prompt = self.get_prompt(batch_size) ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}} + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} model, *_ = deepspeed.initialize(model=model, config=ds_config) model.eval() diff --git a/tests/unit/hybrid_engine/test_he_llama.py b/tests/unit/hybrid_engine/test_he_llama.py index fcf5b8ffb89b..af83140ab329 100644 --- a/tests/unit/hybrid_engine/test_he_llama.py +++ b/tests/unit/hybrid_engine/test_he_llama.py @@ -46,8 +46,12 @@ def get_model(self, model_name): model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) # Make the model smaller so we can run it on a single GPU in CI _ = [model.model.layers.pop(-1) for _ in range(8)] - model = model.half() - model = model.to(f'{get_accelerator().device_name()}:{local_rank}') + dev = get_accelerator().device_name() + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 + model = model.to(dtype=dtype) + model = model.to(f'{dev}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -73,6 +77,9 @@ def test_correctness(self, batch_size, model_name): base_out = self._generate(model, tokenizer, prompt) ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}} + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} model, *_ = deepspeed.initialize(model=model, config=ds_config) model.eval() @@ -90,6 +97,9 @@ def test_functionality(self, batch_size, model_name): prompt = self.get_prompt(batch_size) ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}} + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} model, *_ = deepspeed.initialize(model=model, config=ds_config) model.eval() diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py index ea27239ed55e..64185aa13226 100644 --- a/tests/unit/hybrid_engine/test_he_lora.py +++ b/tests/unit/hybrid_engine/test_he_lora.py @@ -15,6 +15,7 @@ import numpy.testing as npt from unit.common import DistributedTest from deepspeed.ops.op_builder import 
InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -119,13 +120,17 @@ def only_optimize_lora_parameters(model): class TestHybridEngineLoRA(DistributedTest): world_size = 1 - def get_model(self, model_name): + def get_model(self, model_name, replace_fp16_to_bf16=False): local_rank = int(os.getenv("LOCAL_RANK", "0")) model_config = AutoConfig.from_pretrained(model_name) model_config.dropout = 0.0 model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) - model = model.half() - model = model.to(f'cuda:{local_rank}') + if replace_fp16_to_bf16: + model = model.bfloat16() + else: + model = model.half() + device = get_accelerator().device_name() + model = model.to(f'{device}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -146,8 +151,11 @@ def get_train_sentences(self, batch_size): raise NotImplementedError(f"batch_size {batch_size} not implemented") def test_lora(self, batch_size, model_name, zero_stage, offload_device): + replace_fp16_to_bf16 = False + if os.getenv("REPLACE_FP16", default=None): + replace_fp16_to_bf16 = True local_rank = int(os.getenv("LOCAL_RANK", "0")) - model = self.get_model(model_name) + model = self.get_model(model_name, replace_fp16_to_bf16) tokenizer = self.get_tokenizer(model_name) train_sentences = self.get_train_sentences(batch_size) @@ -180,6 +188,9 @@ def test_lora(self, batch_size, model_name, zero_stage, offload_device): } } + if replace_fp16_to_bf16: + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} model, *_ = deepspeed.initialize(model=model, config=ds_config) # Verify gradient norm is larger than 0 @@ -190,7 +201,8 @@ def test_lora(self, batch_size, model_name, zero_stage, offload_device): model.train() batch = tokenizer(train_sentences, max_length=16, padding="max_length", truncation=True, return_tensors="pt") - batch = to_device(batch, f'cuda:{local_rank}') + device = get_accelerator().device_name() + batch = to_device(batch, f'{device}:{local_rank}') batch["labels"] = batch["input_ids"] outputs = model(**batch, use_cache=False) loss = outputs.loss diff --git a/tests/unit/inference/test_checkpoint_sharding.py b/tests/unit/inference/test_checkpoint_sharding.py index 564b3fab6bf4..dc4614cfae3e 100644 --- a/tests/unit/inference/test_checkpoint_sharding.py +++ b/tests/unit/inference/test_checkpoint_sharding.py @@ -14,6 +14,7 @@ from huggingface_hub import snapshot_download from transformers.utils import is_offline_mode from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -44,6 +45,8 @@ def model_name(request): @pytest.fixture(params=[torch.float16, torch.int8], ids=["fp16", "int8"]) def dtype(request): + if request.param not in get_accelerator().supported_dtypes(): + pytest.skip(f"{request.param} not supported by {get_accelerator().device_name()}.") return request.param @@ -74,6 +77,7 @@ class TestCheckpointShard(DistributedTest): world_size = 2 def test(self, model_name, dtype, class_tmpdir, save_shard): + world_size = int(os.getenv("WORLD_SIZE", "1")) inf_config = { "replace_with_kernel_inject": True, diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 
6b5588d8a1f7..ae2e3e1207c2 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -7,6 +7,7 @@ import time import torch import pytest +import json import itertools import deepspeed from deepspeed.git_version_info import torch_info @@ -16,11 +17,12 @@ from transformers import pipeline, AutoTokenizer from transformers.models.t5.modeling_t5 import T5Block from transformers.models.roberta.modeling_roberta import RobertaLayer -from huggingface_hub import HfApi +from huggingface_hub import HfApi, hf_api from deepspeed.model_implementations import DeepSpeedTransformerInference from torch import nn from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from transformers import BertLayer rocm_version = OpBuilder.installed_rocm_version() if rocm_version != (0, 0): @@ -58,26 +60,41 @@ "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. "facebook/opt-350m", # 350m applies layer norm after attention layer which is different than other variants. ] +if os.getenv("TRANSFORMERS_OFFLINE", default=None): + if os.getenv("HF_HOME", default=None): + model_info_f = os.path.join(os.getenv("HF_HOME", default=None), 'model_info.json') + with open(model_info_f, 'r') as f: + data = json.load(f) + _all_models = [hf_api.ModelInfo(**x) for x in data] + else: + assert 1 +elif os.getenv("STORE_HF", default=None): + if os.getenv("HF_HOME", default=None): + _all_models = list(HfApi().list_models()) + all_models_info = [model_info.__dict__ for model_info in _all_models] + json_object = json.dumps(all_models_info, indent=4) + model_info_f = os.path.join(os.getenv("HF_HOME", default=None), 'model_info.json') + with open(model_info_f, 'w') as f: + f.write(json_object) +else: + _all_models = list(HfApi().list_models()) + _test_models = set(_bert_models + _roberta_models + _gpt_models + _opt_models) +_hf_model_names = [m.modelId for m in _all_models] _test_tasks = [ "fill-mask", "question-answering", "text-classification", "token-classification", "text-generation", "text2text-generation", "summarization", "translation" ] - -# Get a list of all models and mapping from task to supported models -_hf_models = list(HfApi().list_models()) -_hf_model_names = [m.modelId for m in _hf_models] -_hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} - +_hf_task_to_models = {task: [m.modelId for m in _all_models if m.pipeline_tag == task] for task in _test_tasks} # Get all combinations of task:model to test _model_w_tasks = [(m, t) for m, t in itertools.product(*[_test_models, _test_tasks]) if m in _hf_task_to_models[t]] - # Assign to pytest variables for testing pytest.model_w_tasks = _model_w_tasks pytest.mt_names = [f"{m}-{t}" for m, t in pytest.model_w_tasks] -@pytest.fixture(scope="module", autouse=True) +#Hugging Face model: WA. Hugging Face models were updated, causing the _test_models list to not be found in _hf_model_names. Changed the fixture from True to False. 
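The offline/online switch above amounts to a small save/load round-trip for the Hub model listing: a run with STORE_HF set caches HfApi().list_models() to model_info.json under HF_HOME, and a run with TRANSFORMERS_OFFLINE set rebuilds the ModelInfo objects from that cache. A minimal sketch of that round-trip follows; the helper names are hypothetical, and it assumes HF_HOME points at the cache directory and that the ModelInfo fields round-trip through JSON, as the patch's own serialization does.

    import json
    import os

    from huggingface_hub import HfApi, hf_api


    def save_model_listing(path):
        # STORE_HF path: query the Hub once and cache the listing on disk.
        models = list(HfApi().list_models())
        with open(path, "w") as f:
            json.dump([m.__dict__ for m in models], f, indent=4)


    def load_model_listing(path):
        # TRANSFORMERS_OFFLINE path: rebuild ModelInfo objects from the cache
        # instead of contacting the Hub.
        with open(path, "r") as f:
            return [hf_api.ModelInfo(**entry) for entry in json.load(f)]


    # The cache lives alongside the rest of the HF data, e.g. $HF_HOME/model_info.json.
    model_info_f = os.path.join(os.getenv("HF_HOME", "."), "model_info.json")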
+@pytest.fixture(scope="module", autouse=False) def verify_models(): # Verify all test models are registered in HF _test_models_not_found = [m for m in _test_models if m not in _hf_model_names] @@ -99,8 +116,17 @@ def model_w_task(request): return request.param -@pytest.fixture(params=[torch.float, torch.half], ids=["fp32", "fp16"]) +dtype_params = [torch.float, torch.half] +dtype_ids = ["fp32", "fp16"] +if get_accelerator().is_bf16_supported(): + dtype_params.append(torch.bfloat16) + dtype_ids.append("bf16") + + +@pytest.fixture(params=dtype_params, ids=dtype_ids) def dtype(request): + if request.param not in get_accelerator().supported_dtypes(): + pytest.skip(f"{request.param} not supported by {get_accelerator().device_name()}.") return request.param @@ -231,7 +257,7 @@ def verify_injection(module): def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): model, task = model_w_task msg = "" - if enable_cuda_graph and (torch_info["cuda_version"] == "0.0"): + if enable_cuda_graph and (torch_info["cuda_version"] == "0.0") and get_accelerator().device_name() != 'hpu': msg = "CUDA not detected, cannot use CUDA Graph" elif enable_cuda_graph and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"): msg = "CUDA Graph is only available in torch versions >= 1.10" @@ -248,6 +274,8 @@ def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): msg = f"Bloom models only support half precision, cannot use dtype {dtype}" elif ("bert" not in model.lower()) and enable_cuda_graph: msg = "Non bert/roberta models do no support CUDA Graph" + elif not get_accelerator().is_triton_supported() and enable_triton: + msg = f"Triton is not supported for {get_accelerator().device_name()}." elif enable_triton and not (dtype in [torch.half]): msg = "Triton is for fp16" elif enable_triton and not deepspeed.HAS_TRITON: @@ -261,7 +289,7 @@ def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): return msg -@pytest.mark.inference +@pytest.mark.nightly class TestModelTask(DistributedTest): world_size = 1 @@ -284,7 +312,7 @@ def test( local_rank = int(os.getenv("LOCAL_RANK", "0")) # Load the model on CPU first to avoid OOM for large models @fp32 - pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt", torch_dtype=dtype) if dtype == torch.half: pipe.model.half() @@ -341,6 +369,96 @@ def test( assert assert_fn(bs_output, ds_output) +@pytest.mark.skipif(get_accelerator().device_name() != 'hpu', reason="Kernel Inject False validation for HPU tests.") +@pytest.mark.nightly +class TestModelTaskKIFalse(DistributedTest): + world_size = 1 + + def test( + self, + model_w_task, + dtype, + enable_cuda_graph, + enable_triton, + query, + inf_kwargs, + assert_fn, + perf_meas=True, + ): + invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton) + if invalid_test_msg: + pytest.skip(invalid_test_msg) + + model, task = model_w_task + local_rank = int(os.getenv("LOCAL_RANK", "0")) + + # Load the model on CPU first to avoid OOM for large models @fp32 + if dtype == torch.bfloat16: + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt", torch_dtype=torch.bfloat16) + else: + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + if dtype == torch.half: + pipe.model.half() + + # Switch device to GPU/HPU after converting to half + device = torch.device(get_accelerator().device_name(local_rank)) + 
pipe.device = device + pipe.model.to(device) + + # Warm-up queries for perf measurement + #for i in range(10): + # _ = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + start = time.time() + bs_output = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + bs_time = time.time() - start + injection_policy = {BertLayer: ("output.dense", )} + if "facebook/opt" in model or "Norod78" in model: + injection_policy = {BertLayer: ("out_proj", )} + if "gpt2" in model or "EleutherAI" in model or "bigscience/bloom" in model: + injection_policy = {BertLayer: ("mlp", )} + if "distilbert" in model: + injection_policy = {BertLayer: ("output_layer_norm", )} + args = { + 'mp_size': 1, + 'dtype': dtype, + 'replace_with_kernel_inject': False, + 'enable_cuda_graph': enable_cuda_graph, + 'use_triton': enable_triton, + 'triton_autotune': False, + 'injection_policy': injection_policy, + } + if pipe.tokenizer.model_max_length < deepspeed.ops.transformer.inference.config.DeepSpeedInferenceConfig( + ).max_out_tokens: + args.update({'max_out_tokens': pipe.tokenizer.model_max_length}) + pipe.model = deepspeed.init_inference(pipe.model, **args) + # Warm-up queries for perf measurement + #for i in range(10): + # _ = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + start = time.time() + ds_output = pipe(query, **inf_kwargs) + get_accelerator().synchronize() + ds_time = time.time() - start + + if perf_meas: + print( + f"model={model}, task={task}, dtype={dtype}, cuda_graph={enable_cuda_graph}, triton={enable_triton}, bs_time={bs_time}, ds_time={ds_time}" + ) + + # facebook/opt* and some bigscient/bloom* models are not matching + # baseline exactly, adding an exception to them for now + if ("opt" in model) or ("bloom" in model): + bs_output = pipe(query, **inf_kwargs) + + # These performance tests are only measuring the time for a single + # inference request, we just want to check that performance isn't terrible + #assert ds_time <= (bs_time * 1.1) + + assert assert_fn(bs_output, ds_output) + + @pytest.mark.seq_inference @pytest.mark.parametrize("model_w_task", [("EleutherAI/gpt-neo-1.3B", "text-generation"), ("EleutherAI/gpt-neox-20b", "text-generation"), @@ -372,7 +490,6 @@ def test( # enough GPU memory pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, dtype=dtype, @@ -401,12 +518,16 @@ def test( ): model, task = model_w_task dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") local_rank = int(os.getenv("LOCAL_RANK", "0")) - - pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt") + device = local_rank + if get_accelerator().device_name() != 'cuda': + device = torch.device(f"{get_accelerator().device_name()}") + pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt") bs_output = pipe(query, **inf_kwargs) pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, @@ -497,7 +618,6 @@ def test( # enough GPU memory pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype, @@ -634,13 +754,22 @@ def 
no_pool_bootstrap_stderr(f, xs, iters): if 'gpt-j-6b' in model_name: dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", {"device": "cpu"}) - setattr(lm, model_family, getattr(lm, model_family).half().to(device)) + setattr(lm, model_family, getattr(lm, model_family).to(dtype=dtype).to(device)) lm._device = device else: - lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", {"device": get_accelerator().device_name()}) + if get_accelerator().device_name() == 'hpu': + #lm_eval not supporting HPU device, so get model with CPU and move it to HPU. + lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", + {"device": "cpu"}) + setattr(lm, model_family, getattr(lm, model_family).to(device)) + lm._device = device + else: + lm = lm_eval.models.get_model(model_family).create_from_arg_string( + f"pretrained={model_name}", {"device": get_accelerator().device_name()}) get_accelerator().synchronize() start = time.time() diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 375563abf65b..4bed47b4560b 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -5,6 +5,7 @@ import pytest import torch +import os import deepspeed from unit.common import DistributedTest from unit.simple_model import create_config_from_dict @@ -17,7 +18,8 @@ class TestInferenceConfig(DistributedTest): def test_overlap_kwargs(self): config = {"replace_with_kernel_inject": True} kwargs = {"replace_with_kernel_inject": True} - + if os.getenv("REPLACE_FP16", default=None): + kwargs['dtype'] = "torch.bfloat16" engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) assert engine._config.replace_with_kernel_inject @@ -31,14 +33,14 @@ def test_overlap_kwargs_conflict(self): def test_kwargs_and_config(self): config = {"replace_with_kernel_inject": True} kwargs = {"dtype": torch.float32} - engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) assert engine._config.replace_with_kernel_inject assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): config = {"replace_with_kernel_inject": True} + if os.getenv("REPLACE_FP16", default=None): + config['dtype'] = "torch.bfloat16" config_json = create_config_from_dict(tmpdir, config) - engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) assert engine._config.replace_with_kernel_inject diff --git a/tests/unit/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py index 23e49f89025b..9ae96ed12fcf 100644 --- a/tests/unit/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -16,6 +16,9 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) + @pytest.mark.inference @pytest.mark.parametrize("use_cuda_events", [True, False]) @@ -31,7 +34,6 @@ def test(self, enable_cuda_graph, use_cuda_events): local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) - pipe = pipeline(task, model, framework="pt", 
device=get_accelerator().device_name(local_rank)) pipe.model = deepspeed.init_inference(pipe.model, dtype=dtype, diff --git a/tests/unit/inference/test_stable_diffusion.py b/tests/unit/inference/test_stable_diffusion.py index ac39b7ab12fa..8ac29e1df7eb 100644 --- a/tests/unit/inference/test_stable_diffusion.py +++ b/tests/unit/inference/test_stable_diffusion.py @@ -20,22 +20,25 @@ class TestStableDiffusion(DistributedTest): def test(self): from diffusers import DiffusionPipeline from image_similarity_measures.quality_metrics import rmse - generator = torch.Generator(device=get_accelerator().current_device()) + dev = get_accelerator().device_name() + generator = torch.Generator(device=dev) seed = 0xABEDABE7 generator.manual_seed(seed) prompt = "a dog on a rocket" model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) - device = torch.device(f"cuda:{local_rank}") - - pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) + device = torch.device(f"{dev}:{local_rank}") + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 + pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=dtype) pipe = pipe.to(device) baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] pipe = deepspeed.init_inference( pipe, mp_size=1, - dtype=torch.half, + dtype=dtype, replace_with_kernel_inject=True, enable_cuda_graph=True, ) diff --git a/tests/unit/inference/v2/inference_test_utils.py b/tests/unit/inference/v2/inference_test_utils.py index d63c51267e51..9405b6fde724 100644 --- a/tests/unit/inference/v2/inference_test_utils.py +++ b/tests/unit/inference/v2/inference_test_utils.py @@ -44,3 +44,10 @@ def allclose(x, y, tolerances: Tuple[int, int] = None): else: rtol, atol = tolerances return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def skip_on_inference_v2(): + if get_accelerator().device_name() == 'hpu': + return True + else: + return False diff --git a/tests/unit/inference/v2/kernels/core_ops/test_bias_activation.py b/tests/unit/inference/v2/kernels/core_ops/test_bias_activation.py index 376188b92565..49dbdc715556 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_bias_activation.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_bias_activation.py @@ -11,7 +11,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_utils import ActivationType, DtypeEnum from deepspeed.inference.v2.kernels.core_ops import CUDABiasActivation -from ....v2.inference_test_utils import get_dtypes, allclose +from ....v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_bias_act_implementation(input: torch.Tensor, bias: Optional[torch.Tensor], diff --git a/tests/unit/inference/v2/kernels/core_ops/test_blas_linear.py b/tests/unit/inference/v2/kernels/core_ops/test_blas_linear.py index 864db6204a16..9d8d2c177607 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_blas_linear.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_blas_linear.py @@ -10,7 +10,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.core_ops import BlasLibLinear -from ....v2.inference_test_utils import allclose +from ....v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported 
by {get_accelerator().device_name()}.') # Note: only testing with FP16 and BF16 because we use TF32 on Ampere and we don't have a good # set of tolerances. Since this is just on top of BLAS though, the test is more about diff --git a/tests/unit/inference/v2/kernels/core_ops/test_gated_activation.py b/tests/unit/inference/v2/kernels/core_ops/test_gated_activation.py index 8cb95a6cdcba..4232cb7a6324 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_gated_activation.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_gated_activation.py @@ -11,7 +11,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.core_ops import CUDAGatedActivation from deepspeed.inference.v2.inference_utils import ActivationType -from ....v2.inference_test_utils import get_dtypes, allclose +from ....v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_geglu_implementation(input: torch.Tensor, diff --git a/tests/unit/inference/v2/kernels/core_ops/test_post_ln.py b/tests/unit/inference/v2/kernels/core_ops/test_post_ln.py index 0b489894bb9b..0549316081ee 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_post_ln.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_post_ln.py @@ -8,7 +8,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.core_ops import CUDAFPPostLN -from ....v2.inference_test_utils import get_dtypes, allclose +from ....v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: torch.Tensor, gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/kernels/core_ops/test_pre_ln.py b/tests/unit/inference/v2/kernels/core_ops/test_pre_ln.py index ffb748e57af2..4da5173d5f53 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_pre_ln.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_pre_ln.py @@ -8,7 +8,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.core_ops import CUDAFPPreLN -from ....v2.inference_test_utils import get_dtypes, allclose +from ....v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: torch.Tensor, gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/kernels/core_ops/test_rms_norm.py b/tests/unit/inference/v2/kernels/core_ops/test_rms_norm.py index 63b16da171c9..16357d0f2967 100644 --- a/tests/unit/inference/v2/kernels/core_ops/test_rms_norm.py +++ b/tests/unit/inference/v2/kernels/core_ops/test_rms_norm.py @@ -9,7 +9,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_utils import DtypeEnum from deepspeed.inference.v2.kernels.core_ops import CUDARMSNorm, CUDARMSPreNorm -from ....v2.inference_test_utils import get_dtypes, allclose +from ....v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_rms_norm(vals: torch.Tensor, 
gamma: torch.Tensor, epsilon: float = 1e-5) -> torch.Tensor: diff --git a/tests/unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py b/tests/unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py index ed76dabe1f4c..c396cc1268d0 100644 --- a/tests/unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py +++ b/tests/unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py @@ -9,7 +9,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_utils import ActivationType, DtypeEnum from deepspeed.inference.v2.kernels.cutlass_ops import MoEGEMM -from ....v2.inference_test_utils import allclose +from ....v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') SINGLE_EXPERT_CASES = [(13, 2048, 2048), (256, 1024, 4096), (278, 5120, 2048), (893, 5120, 2560)] diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_atom_builder.py b/tests/unit/inference/v2/kernels/ragged_ops/test_atom_builder.py index a33c938a0608..ad88ff2c5d69 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_atom_builder.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_atom_builder.py @@ -6,8 +6,13 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.ragged_ops import AtomBuilder from .ragged_testing_utils import build_complex_batch +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') Q_BLOCK_SIZE = 128 KV_BLOCK_SIZE = 128 diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py index ce5a178c9548..bb0192bbbde6 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py @@ -23,7 +23,10 @@ from deepspeed.ops.op_builder import RaggedUtilsBuilder from .ragged_testing_utils import build_batch_and_manager -from ....v2.inference_test_utils import allclose +from ....v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') try: from flash_attn.flash_attn_interface import flash_attn_varlen_func diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py index 90fe26eb4490..6df538ac18bb 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py @@ -9,6 +9,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.ragged_ops import LinearBlockedKVCopy from .ragged_testing_utils import build_batch_and_manager, validate_kv_cache +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') @pytest.mark.inference_v2_ops diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py index 618c2d3b87ec..e650d37dfdff 100644 --- 
a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py @@ -12,7 +12,10 @@ from deepspeed.inference.v2.kernels.ragged_ops import BlockedRotaryEmbeddings, BlockedTrainedRotaryEmbeddings from deepspeed.inference.v2.ragged import RaggedBatchWrapper, DSSequenceDescriptor from .ragged_testing_utils import build_batch_and_manager, validate_kv_cache -from ....v2.inference_test_utils import allclose +from ....v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') """ NOTE(cmikeh2): It is very possible to see unit test failures (even on FP16) depending on when certain values are casted up to or down from float32. If we are seeing accuracy issues, we should diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_logits_gather.py b/tests/unit/inference/v2/kernels/ragged_ops/test_logits_gather.py index 1feefa9ee588..e00aa85d194c 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_logits_gather.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_logits_gather.py @@ -10,9 +10,12 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.ragged_ops import RaggedLogitsGather -from ....v2.inference_test_utils import allclose, get_dtypes +from ....v2.inference_test_utils import allclose, get_dtypes, skip_on_inference_v2 from .ragged_testing_utils import build_simple_batch +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') + def baseline_implementation(hidden_states: torch.Tensor, seq_lens: List[int]) -> torch.Tensor: output = torch.empty((len(seq_lens), hidden_states.shape[1]), diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py index 5fa375b49c19..292381bf5e15 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py @@ -14,6 +14,10 @@ RaggedTop1Gating, ) from .ragged_testing_utils import build_simple_batch +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') """ For simplicity's sake, these tests do rely on ``RaggedTop1Gating`` and ``MoEScatter`` to produce correct inputs. If either of these kernels is broken diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py index 4ca051410c1c..0b90dd82b1b6 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py @@ -10,6 +10,10 @@ from deepspeed.inference.v2.inference_utils import DtypeEnum from deepspeed.inference.v2.kernels.ragged_ops import MoEScatter, RaggedTop1Gating from .ragged_testing_utils import build_simple_batch +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') """ For simplicity's sake, these tests do rely on ``RaggedTop1Gating`` to produce correct inputs. 
If ``RaggedTop1Gating`` is broken, these tests will fail, so double check diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py b/tests/unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py index f179f62a9b12..32d7d312a4cf 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py @@ -10,9 +10,12 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.kernels.ragged_ops import RaggedEmbeddingKernel -from ....v2.inference_test_utils import allclose, get_dtypes +from ....v2.inference_test_utils import allclose, get_dtypes, skip_on_inference_v2 from .ragged_testing_utils import build_batch_and_manager +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') + def baseline_implementation(token_ids: torch.Tensor, embedding_table: torch.Tensor, diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py b/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py index 6ff2508bf320..1d450c9552ff 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py @@ -11,7 +11,10 @@ from deepspeed.inference.v2.inference_utils import DtypeEnum from deepspeed.inference.v2.kernels.ragged_ops import RaggedTop1Gating from .ragged_testing_utils import build_simple_batch -from ....v2.inference_test_utils import allclose +from ....v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def _test_single_mapping_helper(n_tokens: int, diff --git a/tests/unit/inference/v2/model_implementations/parameters/test_contiguify.py b/tests/unit/inference/v2/model_implementations/parameters/test_contiguify.py index 52ff0e134dfc..901d9d9b43e7 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_contiguify.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_contiguify.py @@ -15,6 +15,10 @@ ) from deepspeed.inference.v2.model_implementations.layer_container_base import LayerContainer from .utils import SimpleParam, DummyInferenceModel +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') class TransformerLayerContainer(LayerContainer): diff --git a/tests/unit/inference/v2/model_implementations/parameters/test_layer_inheritance.py b/tests/unit/inference/v2/model_implementations/parameters/test_layer_inheritance.py index 07ad87e6168d..c457227d5499 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_layer_inheritance.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_layer_inheritance.py @@ -6,10 +6,15 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_parameter import InferenceParameter from deepspeed.inference.v2.model_implementations.layer_container_base import LayerContainer from .utils import SimpleParam, DummyInferenceModel +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') class ParentLayer(LayerContainer): diff --git 
a/tests/unit/inference/v2/model_implementations/parameters/test_mapping.py b/tests/unit/inference/v2/model_implementations/parameters/test_mapping.py index 52313cb6f202..0701b8dcc4d8 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_mapping.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_mapping.py @@ -6,10 +6,15 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.allocator import on_device from deepspeed.inference.v2.inference_parameter import InferenceParameter from deepspeed.inference.v2.model_implementations.parameter_base import ParameterBase, ParamList from deepspeed.inference.v2.model_implementations.layer_container_base import LayerContainer +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') class MultiDependencyContainer(ParameterBase): diff --git a/tests/unit/inference/v2/model_implementations/parameters/test_multi_parameter_layer.py b/tests/unit/inference/v2/model_implementations/parameters/test_multi_parameter_layer.py index b319bf6de4ad..e7ba08b3c2a8 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_multi_parameter_layer.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_multi_parameter_layer.py @@ -6,10 +6,15 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_parameter import InferenceParameter from deepspeed.inference.v2.model_implementations.layer_container_base import LayerContainer from .utils import validate_device, SimpleParam, ListParam, DummyInferenceModel +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') class MultiParameterLayer(LayerContainer): diff --git a/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py b/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py index 260236562ee9..7d68a7e17ab9 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py @@ -6,6 +6,7 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.allocator import on_device from deepspeed.inference.v2.inference_parameter import InferenceParameter from deepspeed.inference.v2.model_implementations.parameter_base import ParameterBase, ParamList @@ -13,6 +14,10 @@ from deepspeed.inference.v2.model_implementations.common_parameters import * from .utils import validate_device +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') class SimpleMoELayer(LayerContainer): diff --git a/tests/unit/inference/v2/model_implementations/sharding/test_attn_out_sharding.py b/tests/unit/inference/v2/model_implementations/sharding/test_attn_out_sharding.py index 850c4c24fde6..fb7901dbf938 100644 --- a/tests/unit/inference/v2/model_implementations/sharding/test_attn_out_sharding.py +++ b/tests/unit/inference/v2/model_implementations/sharding/test_attn_out_sharding.py @@ -8,6 +8,10 @@ from deepspeed.accelerator import get_accelerator from 
deepspeed.inference.v2.model_implementations.sharding import * +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') # None of the logic should be dependent on head size. HEAD_SIZE = 64 diff --git a/tests/unit/inference/v2/model_implementations/sharding/test_mlp_sharding.py b/tests/unit/inference/v2/model_implementations/sharding/test_mlp_sharding.py index aac7e5391d8f..553d604d30ee 100644 --- a/tests/unit/inference/v2/model_implementations/sharding/test_mlp_sharding.py +++ b/tests/unit/inference/v2/model_implementations/sharding/test_mlp_sharding.py @@ -8,6 +8,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.model_implementations.sharding import * +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def round_up_to_256(x: int) -> int: diff --git a/tests/unit/inference/v2/model_implementations/sharding/test_qkv_sharding.py b/tests/unit/inference/v2/model_implementations/sharding/test_qkv_sharding.py index 9a1cb9c09c64..86575d2176ad 100644 --- a/tests/unit/inference/v2/model_implementations/sharding/test_qkv_sharding.py +++ b/tests/unit/inference/v2/model_implementations/sharding/test_qkv_sharding.py @@ -10,6 +10,10 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.model_implementations.sharding import * +from ....v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def fill_with_head_ids(head_size: int, n_heads_q: int, n_heads_kv: Optional[int] = None) -> torch.Tensor: diff --git a/tests/unit/inference/v2/modules/test_blas_linear_module.py b/tests/unit/inference/v2/modules/test_blas_linear_module.py index f4d0b1991238..b50819875699 100644 --- a/tests/unit/inference/v2/modules/test_blas_linear_module.py +++ b/tests/unit/inference/v2/modules/test_blas_linear_module.py @@ -13,7 +13,10 @@ from deepspeed.inference.v2.modules import ConfigBundle from deepspeed.inference.v2.modules.configs import DSLinearConfig from deepspeed.inference.v2.modules.interfaces import DSLinearRegistry -from ...v2.inference_test_utils import allclose +from ...v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], diff --git a/tests/unit/inference/v2/modules/test_blocked_attn.py b/tests/unit/inference/v2/modules/test_blocked_attn.py index 215ad64636b1..547a32fd3003 100644 --- a/tests/unit/inference/v2/modules/test_blocked_attn.py +++ b/tests/unit/inference/v2/modules/test_blocked_attn.py @@ -16,7 +16,10 @@ from deepspeed.inference.v2.modules.interfaces import DSSelfAttentionRegistry, DSSelfAttentionBase from ..kernels.ragged_ops.ragged_testing_utils import build_batch_and_manager -from ...v2.inference_test_utils import allclose +from ...v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') try: from flash_attn.flash_attn_interface import 
flash_attn_varlen_func diff --git a/tests/unit/inference/v2/modules/test_cuda_pre_ln_module.py b/tests/unit/inference/v2/modules/test_cuda_pre_ln_module.py index 386f3b3ef0b3..dc971f83f0c2 100644 --- a/tests/unit/inference/v2/modules/test_cuda_pre_ln_module.py +++ b/tests/unit/inference/v2/modules/test_cuda_pre_ln_module.py @@ -12,7 +12,10 @@ from deepspeed.inference.v2.modules import ConfigBundle from deepspeed.inference.v2.modules.configs import DSNormConfig from deepspeed.inference.v2.modules.interfaces import DSPreNormRegistry -from ...v2.inference_test_utils import get_dtypes, allclose +from ...v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: Optional[torch.Tensor], gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/modules/test_custom_module.py b/tests/unit/inference/v2/modules/test_custom_module.py index eb54b7a913f2..b813b715ec1e 100644 --- a/tests/unit/inference/v2/modules/test_custom_module.py +++ b/tests/unit/inference/v2/modules/test_custom_module.py @@ -11,7 +11,10 @@ from deepspeed.inference.v2.modules.interfaces import DSPostNormRegistry from deepspeed.inference.v2.modules.configs import DSNormConfig from deepspeed.inference.v2.modules.implementations import cuda_post_ln -from ...v2.inference_test_utils import allclose +from ...v2.inference_test_utils import allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: torch.Tensor, gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/modules/test_cutlass_moe.py b/tests/unit/inference/v2/modules/test_cutlass_moe.py index e21170c9ed8f..785e7f98e67d 100644 --- a/tests/unit/inference/v2/modules/test_cutlass_moe.py +++ b/tests/unit/inference/v2/modules/test_cutlass_moe.py @@ -15,7 +15,10 @@ from deepspeed.inference.v2.modules.interfaces import DSMoERegistry from ..kernels.ragged_ops.ragged_testing_utils import build_simple_batch -from ...v2.inference_test_utils import allclose, get_dtypes +from ...v2.inference_test_utils import allclose, get_dtypes, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def _gating_reference(logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/tests/unit/inference/v2/modules/test_post_ln_module.py b/tests/unit/inference/v2/modules/test_post_ln_module.py index f9dcfd272170..538eb32c3c85 100644 --- a/tests/unit/inference/v2/modules/test_post_ln_module.py +++ b/tests/unit/inference/v2/modules/test_post_ln_module.py @@ -10,7 +10,10 @@ from deepspeed.inference.v2.modules import ConfigBundle from deepspeed.inference.v2.modules.configs import DSNormConfig from deepspeed.inference.v2.modules.interfaces import DSPostNormRegistry -from ...v2.inference_test_utils import get_dtypes, allclose +from ...v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: torch.Tensor, gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/modules/test_pre_rms_module.py 
b/tests/unit/inference/v2/modules/test_pre_rms_module.py index bbd108a35a5a..58bf7761bafa 100644 --- a/tests/unit/inference/v2/modules/test_pre_rms_module.py +++ b/tests/unit/inference/v2/modules/test_pre_rms_module.py @@ -12,7 +12,10 @@ from deepspeed.inference.v2.modules import ConfigBundle from deepspeed.inference.v2.modules.configs import DSNormConfig from deepspeed.inference.v2.modules.interfaces import DSPreNormRegistry -from ...v2.inference_test_utils import get_dtypes, allclose +from ...v2.inference_test_utils import get_dtypes, allclose, skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') def reference_implementation(residual: torch.Tensor, hidden_states: Optional[torch.Tensor], gamma: torch.Tensor, diff --git a/tests/unit/inference/v2/ragged/test_blocked_allocator.py b/tests/unit/inference/v2/ragged/test_blocked_allocator.py index 4596e81c5652..6dddeff6ee9f 100644 --- a/tests/unit/inference/v2/ragged/test_blocked_allocator.py +++ b/tests/unit/inference/v2/ragged/test_blocked_allocator.py @@ -9,7 +9,12 @@ import pytest import torch +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.ragged.blocked_allocator import BlockedAllocator +from ...v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') @pytest.mark.inference_v2 diff --git a/tests/unit/inference/v2/ragged/test_manager_configs.py b/tests/unit/inference/v2/ragged/test_manager_configs.py index bdd513445ddb..2310d2fcbb15 100644 --- a/tests/unit/inference/v2/ragged/test_manager_configs.py +++ b/tests/unit/inference/v2/ragged/test_manager_configs.py @@ -7,7 +7,12 @@ from pydantic import ValidationError +from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.ragged import DSStateManagerConfig +from ...v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') @pytest.mark.inference_v2 diff --git a/tests/unit/inference/v2/ragged/test_ragged_wrapper.py b/tests/unit/inference/v2/ragged/test_ragged_wrapper.py index 3cb74f4c49d2..669368f9ef9d 100644 --- a/tests/unit/inference/v2/ragged/test_ragged_wrapper.py +++ b/tests/unit/inference/v2/ragged/test_ragged_wrapper.py @@ -14,6 +14,10 @@ RaggedBatchWrapper, DSStateManagerConfig, ) +from ...v2.inference_test_utils import skip_on_inference_v2 + +pytestmark = pytest.mark.skipif(skip_on_inference_v2(), + reason=f'Inference V2 not supported by {get_accelerator().device_name()}.') @pytest.mark.inference_v2 diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index 011ebaf4d3b9..58d5a1308d95 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -22,7 +22,7 @@ def get_megatron_version(): def get_gpt2_model(args_others, mp_size=1): - from megatron.model import GPT2Model + from megatron.model import GPT2Model # noqa: F401 from megatron.initialize import initialize_megatron args_defaults = { diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index 310a0df16381..34bc247b3045 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -7,6 +7,7 @@ import deepspeed import pytest import gc +import os from unit.common import DistributedTest from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, 
sequence_dataloader from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param @@ -34,6 +35,11 @@ def test(self, ep_size, zero_stage, use_residual): } } hidden_dim = 16 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 # E+D -- ep_size = 2 # E only -- ep_size = 4 @@ -47,7 +53,11 @@ def test(self, ep_size, zero_stage, use_residual): dist_init_required=False) #dist_init_required=False -- parameterize to True/False? - data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = sequence_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) def strict_average_tensor(tensor): process_group = optimizer.dp_process_group @@ -116,6 +126,11 @@ def test(self, ep_size, use_residual): config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 # E+D -- ep_size = 2 # E only -- ep_size = 4 @@ -126,7 +141,11 @@ def test(self, ep_size, use_residual): optimizer=optimizer, dist_init_required=False) - data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = sequence_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index 0069c674690c..cc4e88e2f366 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import os import torch import deepspeed import pytest @@ -63,6 +64,10 @@ def test(self, ep_size, tp_size, enable_expert_tp, use_residual): config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 tensor_parallel_expert = torch.nn.Sequential(torch.nn.Linear(hidden_dim, 4 * hidden_dim // tp_size), torch.nn.ReLU(), diff --git a/tests/unit/ops/adam/test_adamw.py b/tests/unit/ops/adam/test_adamw.py index 3b1b088766a5..7308d772d1de 100644 --- a/tests/unit/ops/adam/test_adamw.py +++ b/tests/unit/ops/adam/test_adamw.py @@ -6,7 +6,7 @@ import deepspeed import torch import pytest - +import os from deepspeed.ops.adam import FusedAdam from deepspeed.ops.adam import DeepSpeedCPUAdam from unit.common import DistributedTest @@ -67,6 +67,9 @@ def test(self, "cpu_offload": zero_offload } } + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled" : True} model = SimpleModel(10) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index 9a6ff6689446..457a1ef9d20a 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -7,11 +7,12 @@ import numpy as np import pytest from cpuinfo import get_cpu_info +import os import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import 
FusedAdam -from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder from unit.common import DistributedTest if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: @@ -62,9 +63,13 @@ class TestCPUAdam(DistributedTest): set_dist_env = False @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test_fused_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") + if dtype == torch.float16 and os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -91,6 +96,8 @@ def test_torch_adamw_equal(self, dtype, model_size): if get_accelerator().is_available(): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") + if dtype == torch.float16 and os.getenv("REPLACE_FP16", default=None): + dtype = torch.bfloat16 ref_param_device = get_accelerator().device_name() else: if dtype == torch.half: diff --git a/tests/unit/ops/adam/test_hybrid_adam.py b/tests/unit/ops/adam/test_hybrid_adam.py index c7ef4890b322..9003e02588c1 100644 --- a/tests/unit/ops/adam/test_hybrid_adam.py +++ b/tests/unit/ops/adam/test_hybrid_adam.py @@ -12,7 +12,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import FusedAdam, DeepSpeedCPUAdam -from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder from unit.common import DistributedTest if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: @@ -43,6 +43,8 @@ class TestHybridAdam(DistributedTest): set_dist_env = False @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test_hybrid_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py old mode 100644 new mode 100755 diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py index 217267a97951..13712abb263f 100644 --- a/tests/unit/ops/sparse_attention/test_sparse_attention.py +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -10,6 +10,7 @@ import pytest import torch +import os import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import SparseAttnBuilder @@ -130,6 +131,10 @@ def test_softmax(block, width, dtype): skip_on_arch(min_arch=7) skip_on_cuda(valid_cuda=valid_cuda_versions) + if os.getenv("REPLACE_FP16", default=None): + if dtype == torch.float16: + dtype = torch.bfloat16 + Z = 2 H = 4 scale = 0.4 @@ -236,6 +241,10 @@ def test_matmul(block, dtype, mode, trans_a, trans_b): skip_on_arch(min_arch=7) skip_on_cuda(valid_cuda=valid_cuda_versions) + if os.getenv("REPLACE_FP16", default=None): + if dtype == torch.float16: + dtype = torch.bfloat16 + Z = 3 H = 2 M = 128 diff --git a/tests/unit/ops/transformer/inference/inference_test_utils.py 
b/tests/unit/ops/transformer/inference/inference_test_utils.py index 9c7b428c0e68..9af48e12f4fb 100644 --- a/tests/unit/ops/transformer/inference/inference_test_utils.py +++ b/tests/unit/ops/transformer/inference/inference_test_utils.py @@ -26,8 +26,10 @@ def get_tolerances(): def get_dtypes(): global DTYPES if DTYPES is None: - DTYPES = [torch.float16, torch.float32] + DTYPES = [torch.float32] try: + if get_accelerator().is_fp16_supported(): + DTYPES.append(torch.float16) if get_accelerator().is_bf16_supported(): DTYPES.append(torch.bfloat16) except (AssertionError, AttributeError): diff --git a/tests/unit/ops/transformer/inference/test_attention.py b/tests/unit/ops/transformer/inference/test_attention.py index 13abe8b915c7..2c8f2de7b975 100644 --- a/tests/unit/ops/transformer/inference/test_attention.py +++ b/tests/unit/ops/transformer/inference/test_attention.py @@ -30,6 +30,9 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float if not deepspeed.HAS_TRITON: pytest.skip("triton has to be installed for the test") + if not get_accelerator().is_triton_supported(): + pytest.skip(f"triton is not supported by {get_accelerator().device_name()}") + minus_inf = -65504.0 # skip autotune in testing diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index 843c9b889c2b..059f9ff4624a 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -14,6 +14,7 @@ pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None +inference_torch_module = None torch_minor_version = None diff --git a/tests/unit/ops/transformer/inference/test_bias_add_transform_0213.py b/tests/unit/ops/transformer/inference/test_bias_add_transform_0213.py new file mode 100644 index 000000000000..00ab218ce4dc --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_bias_add_transform_0213.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright Habana Labs, Ltd. an Intel Company''' + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder +from .inference_test_utils import allclose, get_dtypes +import deepspeed.ops.op_builder.torch_fallback_kernels as torch_fallback_kernels + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytestmark = pytest.mark.skip(reason="Inference ops are not available on this system") + +inference_module = None + + +def run_bias_add_transform_0213_reference(input, num_heads, bias, trans_count): + return torch_fallback_kernels.bias_add_transform_0213(input, bias, num_heads, trans_count) + + +def run_bias_add_transform_0213_ds(input, num_heads, bias, trans_count): + global inference_module + if inference_module is None: + inference_module = InferenceBuilder().load() + batch_size = input.shape[0] + seq_length = input.shape[1] + value_size = input.shape[2] + hidden_dim = value_size // trans_count + head_dim = hidden_dim // num_heads + + # Resetting workspace, as when trans_count < 3, not all elements are currently filled in the kernel. 
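+ # The workspace is released and re-allocated on every call, and the allocator and
+ # kernel entry points exposed by the builder are dtype-specific, so both are resolved
+ # by name below using the suffix that torch_fallback_kernels.dtype_names_dict maps
+ # the input dtype to.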
+ inference_module.release_workspace() + allocate_workspace_func = getattr(inference_module, + f"allocate_workspace_{torch_fallback_kernels.dtype_names_dict[input.dtype]}") + kernel_func = getattr(inference_module, + f"bias_add_transform_0213_{torch_fallback_kernels.dtype_names_dict[input.dtype]}") + allocate_workspace_func( + 3 * hidden_dim, + 3 * num_heads, + 3 * seq_length, + 3 * batch_size, + 1, # num_layers + 1, # mp_size + False, # external_cache + 0, # rank + 1024 * 100, # max_out_tokens + 1) # min_out_tokens + return kernel_func(input, bias, num_heads, trans_count) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("trans_count", [1, 3]) +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 9, 18]) +@pytest.mark.parametrize("value", [576, 1152, 2304]) +@pytest.mark.parametrize("heads", [1, 12]) +@pytest.mark.parametrize("dtype", get_dtypes()) +def test_bias_add_transform_0213(trans_count, batch, sequence, value, heads, dtype): + activations_ds = torch.randn((batch, sequence, value), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((batch, sequence, value), dtype=dtype, device=get_accelerator().device_name()) + activations_ref = activations_ds.clone().detach() + bias_ref = bias_ds.clone().detach() + + ds_out = run_bias_add_transform_0213_ds(activations_ds, heads, bias_ds, trans_count) + ref_out = run_bias_add_transform_0213_reference(activations_ref, heads, bias_ref, trans_count) + + for i, (ds, ref) in enumerate(zip(ds_out, ref_out)): + delta = ref - ds + assert allclose(ds, ref), f"Tensor {i} {delta.max()=}, {delta.mean()=} {ds=}, {ref=}" diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index d5ab13964974..d8ebbae87d1d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -15,6 +15,7 @@ pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None +inference_torch_module = None torch_minor_version = None @@ -66,7 +67,7 @@ def run_gated_silu_ds(activation, bias): @pytest.mark.parametrize("batch", [1, 2]) @pytest.mark.parametrize("sequence", [1, 128, 255]) @pytest.mark.parametrize("channels", [512, 1232, 4096]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("dtype", get_dtypes()) def test_gated_silu(batch, sequence, channels, dtype): activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index fd82da51380c..71f5a431e3c4 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -15,6 +15,7 @@ pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None +inference_torch_module = None torch_minor_version = None diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 881af78e92cf..8f160d386fbb 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -34,7 +34,6 @@ def run_bias_relu_ds(activations, bias): return 
inference_module.bias_relu_fp32(activations, bias) -@pytest.mark.inference_ops @pytest.mark.parametrize("batch", [1, 2]) @pytest.mark.parametrize("sequence", [1, 128, 255]) @pytest.mark.parametrize("channels", [512, 1232, 4096]) diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index de924848bfb4..c31ac6fbe63d 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -42,8 +42,9 @@ def run_gelu_ds(activations, use_triton_ops=False): from deepspeed.ops.transformer.inference.triton import gelu return gelu(activations) + device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] - bias = torch.zeros((channels), dtype=activations.dtype, device='cuda') + bias = torch.zeros((channels), dtype=activations.dtype, device=device) global inference_module if inference_module is None: inference_module = InferenceBuilder().load() @@ -60,11 +61,14 @@ def run_gelu_ds(activations, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16]) @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_gelu(batch, sequence, channels, dtype, use_triton_ops): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + device = deepspeed.accelerator.get_accelerator().device_name() + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) activations_ref = activations_ds.clone().detach() if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") + if not deepspeed.accelerator.get_accelerator().is_triton_supported() and use_triton_ops: + pytest.skip(f"triton is not supported by {deepspeed.accelerator.get_accelerator().device_name()}") ds_out = run_gelu_ds(activations_ds, use_triton_ops) ref_out = run_gelu_reference(activations_ref) assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 711a35213015..48b48c058105 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -8,6 +8,7 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.op_builder.torch_fallback_builder import TorchInferenceOpBuilder from .inference_test_utils import allclose, get_dtypes, assert_almost_equal try: import triton # noqa: F401 # type: ignore @@ -22,6 +23,7 @@ pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None +inference_torch_module = None def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): @@ -31,6 +33,10 @@ def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): return torch.nn.functional.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) +def layer_norm_implementation(inference_module, vals, gamma, beta, epsilon): + return inference_module.layer_norm(vals, gamma, beta, epsilon) + + def ds_implementation(vals, gamma, beta, epsilon): global inference_module if inference_module is None: @@ -38,6 +44,13 @@ def ds_implementation(vals, gamma, beta, epsilon): return inference_module.layer_norm(vals, gamma, beta, epsilon) +def torch_implementation(vals, gamma, beta, epsilon): + global inference_torch_module + if inference_torch_module is None: + inference_torch_module 
= TorchInferenceOpBuilder().load() + return layer_norm_implementation(inference_torch_module, vals, gamma, beta, epsilon) + + def ds_triton_implementation(vals, gamma, beta, epsilon): return layer_norm(vals, gamma, beta, epsilon) @@ -51,7 +64,8 @@ def ds_triton_implementation(vals, gamma, beta, epsilon): def test_layer_norm(batch, seq_len, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") - + if use_triton_ops and not get_accelerator().is_triton_supported(): + pytest.skip(f"triton is not supported by {get_accelerator().device_name()}") vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) @@ -64,11 +78,16 @@ def test_layer_norm(batch, seq_len, channels, dtype, use_triton_ops): return else: new_output = ds_implementation(vals, gamma, beta, epsilon) + torch_output = torch_implementation(vals, gamma, beta, epsilon) if not allclose(new_output, ref_output): #print(new_output - ref_output) assert allclose(new_output, ref_output) + if not allclose(torch_output, ref_output): + #print(torch_output - ref_output) + assert allclose(torch_output, ref_output) + def residual_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, dtype): vals_f = vals.to(torch.float32) @@ -82,11 +101,22 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, eps=epsilon).to(dtype) +def residual_implementation(inference_module, vals, bias, res, gamma, beta, epsilon): + return inference_module._layer_norm_residual(vals, bias, res, gamma, beta, epsilon) + + def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): global inference_module if inference_module is None: inference_module = InferenceBuilder().load() - return inference_module._layer_norm_residual(vals, bias, res, gamma, beta, epsilon) + return residual_implementation(inference_module, vals, bias, res, gamma, beta, epsilon) + + +def residual_torch_implementation(vals, bias, res, gamma, beta, epsilon): + global inference_torch_module + if inference_torch_module is None: + inference_torch_module = TorchInferenceOpBuilder().load() + return residual_implementation(inference_torch_module, vals, bias, res, gamma, beta, epsilon) def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): @@ -102,6 +132,8 @@ def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): def test_layer_norm_residual(batch, seq_len, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") + if use_triton_ops and not get_accelerator().is_triton_supported(): + pytest.skip(f"triton is not supported by {get_accelerator().device_name()}") vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) @@ -116,12 +148,14 @@ def test_layer_norm_residual(batch, seq_len, channels, dtype, use_triton_ops): return else: new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) - + torch_output = residual_torch_implementation(vals, bias, residual, gamma, beta, epsilon) ref_output = residual_ref_implementation(vals, bias, residual, gamma, beta, 
epsilon, channels, dtype) print((new_output - ref_output).abs().max()) + print((torch_output - ref_output).abs().max()) assert allclose(new_output, ref_output) + assert allclose(torch_output, ref_output) def residual_store_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, dtype): @@ -136,11 +170,22 @@ def residual_store_ref_implementation(vals, bias, res, gamma, beta, epsilon, cha return norm_output, res_output.to(dtype) +def residual_store_implementation(inference_module, vals, bias, res, gamma, beta, epsilon): + return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) + + def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): global inference_module if inference_module is None: inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) + return residual_store_implementation(inference_module, vals, bias, res, gamma, beta, epsilon) + + +def residual_store_torch_implementation(vals, bias, res, gamma, beta, epsilon): + global inference_torch_module + if inference_torch_module is None: + inference_torch_module = TorchInferenceOpBuilder().load() + return residual_store_implementation(inference_torch_module, vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops @@ -160,11 +205,23 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, bias, residual, gamma, beta, epsilon, channels, dtype) + vals_torch = vals.clone().detach() + bias_torch = bias.clone().detach() + residual_torch = residual.clone().detach() + gamma_torch = gamma.clone().detach() + beta_torch = beta.clone().detach() + ds_norm_output, ds_res_output = residual_store_ds_implementation(vals, bias, residual, gamma, beta, epsilon) + torch_norm_output, torch_res_output = residual_store_torch_implementation(vals_torch, bias_torch, residual_torch, + gamma_torch, beta_torch, epsilon) + assert allclose(ds_res_output, norm_res_output) assert allclose(ds_norm_output, ref_norm_output) + assert allclose(torch_res_output, norm_res_output) + assert allclose(torch_norm_output, ref_norm_output) + @pytest.mark.inference_ops @pytest.mark.parametrize("M", [4]) @@ -175,6 +232,9 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): def test_triton_layer_norm(M, N, dtype, residual, input_bias, eps=1e-5, device='cuda'): if not deepspeed.HAS_TRITON: pytest.skip("triton has to be installed for the test") + if not get_accelerator().is_triton_supported(): + pytest.skip(f"triton is not supported by {get_accelerator().device_name()}") + torch.manual_seed(0) # create data x_shape = (M, N) diff --git a/tests/unit/ops/transformer/inference/test_matmul.py b/tests/unit/ops/transformer/inference/test_matmul.py index 804a85750a3a..c8b6cc8a4b0a 100644 --- a/tests/unit/ops/transformer/inference/test_matmul.py +++ b/tests/unit/ops/transformer/inference/test_matmul.py @@ -44,6 +44,8 @@ def run_matmul_ds(a, b, use_triton_ops=False): def test_matmul_4d(B, H, M, K, N, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") + if not deepspeed.accelerator.get_accelerator().is_triton_supported() and use_triton_ops: + pytest.skip(f"triton is not supported by {deepspeed.accelerator.get_accelerator().device_name()}") # skip autotune in testing from deepspeed.ops.transformer.inference.triton.matmul_ext 
import fp16_matmul diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index e1c8127a83ac..6a4d9c9bf7f2 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -38,6 +38,7 @@ def test_moe_residual_matmul(hidden_dim, c, dtype): coeff2 = torch.randn((1, 1, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) out_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) coeff_ds = torch.cat((coeff1, coeff2), dim=-1) + residual_ref = residual_ds.clone().detach() coeff_ref = coeff_ds.clone().detach() out_ref = out_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index c2952f74ff2d..f028b2e7c13c 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -79,6 +79,8 @@ def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_ pre_attn_norm, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops and dtype == torch.float16: pytest.skip("triton has to be installed for the test") + if use_triton_ops and not get_accelerator().is_triton_supported(): + pytest.skip(f"triton is not supported by {get_accelerator().device_name()}") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_rms_norm.py b/tests/unit/ops/transformer/inference/test_rms_norm.py index 508a40e12e8d..fbb7c5f6808b 100644 --- a/tests/unit/ops/transformer/inference/test_rms_norm.py +++ b/tests/unit/ops/transformer/inference/test_rms_norm.py @@ -74,7 +74,7 @@ def pre_ref_implementation(vals, residual, gamma, epsilon): @pytest.mark.parametrize("batch", [1, 32]) @pytest.mark.parametrize("seq_len", [1, 128]) @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("dtype", get_dtypes()) def test_pre_norm(batch, seq_len, channels, dtype): device = get_accelerator().current_device_name() vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=device) diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 76046f31e01a..633aca58e131 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -43,7 +43,11 @@ def run_softmax_ds(input, use_triton_ops=False): def test_softmax(batch, sequence, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") - input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + + device = deepspeed.accelerator.get_accelerator().device_name() + if not deepspeed.accelerator.get_accelerator().is_triton_supported() and use_triton_ops: + pytest.skip(f"triton is not supported by {device}") + input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() 
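+    # input_ref is a detached copy of the same data, kept so a reference output can be computed
+    # independently of the DeepSpeed/Triton path exercised by run_softmax_ds.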
ds_out = run_softmax_ds(input_ds, use_triton_ops) diff --git a/tests/unit/ops/transformer/inference/test_softmax_context.py b/tests/unit/ops/transformer/inference/test_softmax_context.py new file mode 100644 index 000000000000..8939df13ff01 --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_softmax_context.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright Habana Labs, Ltd. an Intel Company''' + +import pytest +import torch +import math +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.transformer.inference.op_binding import SoftmaxContextOp +from deepspeed.ops.transformer.inference.config import DeepSpeedInferenceConfig +from deepspeed.ops.op_builder import InferenceBuilder +import deepspeed.ops.op_builder.torch_fallback_kernels as torch_fallback_kernels +from .inference_test_utils import allclose, get_dtypes +from packaging import version as pkg_version + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytestmark = pytest.mark.skip(reason="Inference ops are not available on this system") + +inference_module = None + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("batch", [1, 2]) +@pytest.mark.parametrize("sequence", [1, 9, 18]) +@pytest.mark.parametrize("value", [576, 1152, 2304]) +@pytest.mark.parametrize("heads", [6, 12, 24]) +@pytest.mark.parametrize("no_masking", [False, True]) +@pytest.mark.parametrize("num_layers", [1, 2, 6]) +@pytest.mark.parametrize("dtype", get_dtypes()) +@pytest.mark.parametrize("rand", [False, True]) +def test_softmax_context(batch, sequence, value, heads, no_masking, num_layers, dtype, rand): + global inference_module + if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): + pytest.skip("softmax_context implementation matches only after torch 1.12") + + ds_inference_config = DeepSpeedInferenceConfig() + ds_inference_config.dtype = dtype + softmax_context_op = SoftmaxContextOp(ds_inference_config) + device_name = get_accelerator().device_name() + norm_factor = math.sqrt((value // 3) // heads) + position_ids = torch.arange(sequence, dtype=dtype, device=device_name) + + alibi = None + #TODO: support num_kv and rope_theta + num_kv = -1 + rope_theta = 1000 + if (rand): + torch.manual_seed(234) + query_key_value = torch.randn((batch, sequence, value), dtype=dtype, device=device_name) + query_key_value *= torch.tensor(0.1, dtype=dtype) + attn_mask = torch.randn((batch, sequence), dtype=dtype, device=device_name) + attn_mask *= torch.tensor(0.1, dtype=dtype) + from random import randrange + layer_id = randrange(num_layers) + else: + query_key_value = torch.ones((batch, sequence, value), dtype=dtype, device=device_name) + query_key_value *= torch.tensor(0.1, dtype=dtype) + attn_mask = torch.ones((batch, sequence), dtype=dtype, device=device_name) + attn_mask *= torch.tensor(0.1, dtype=dtype) + layer_id = 0 + + #cuda path + if inference_module is None: + inference_module = InferenceBuilder().load() + inference_module.reset_cache() + inference_module.release_workspace() + allocate_workspace_func = getattr(inference_module, + f"allocate_workspace_{torch_fallback_kernels.dtype_names_dict[dtype]}") + max_out_tokens = 100 + assert sequence < max_out_tokens + allocate_workspace_func( + value // 3, + heads, + sequence, + batch, + num_layers, # num_layers + 1, # mp_size + False, # external_cache + 0, # rank + max_out_tokens, # max_out_tokens + 1) # min_out_tokens) + query_key_value_ref = 
query_key_value.clone().detach() + attn_mask_ref = attn_mask.clone().detach() + + output_q, output_k, output_v = softmax_context_op.forward(query_key_value, attn_mask, heads, num_kv, norm_factor, + no_masking, layer_id, num_layers, alibi, True, None, + position_ids) + + #fallback path + torch_fallback_kernels.reset_cache() + torch_fallback_kernels.release_workspace() + torch_fallback_kernels.InferenceContext.Instance().gen_workspace(num_layers, heads, batch, sequence, value // 3, 1, + False, dtype, 0, max_out_tokens, 1) + fallback_output_q, fallback_output_k, fallback_output_v = torch_fallback_kernels.softmax_context_fallback( + query_key_value_ref, attn_mask_ref, ds_inference_config.rotary_dim, ds_inference_config.rotate_half, + ds_inference_config.rotate_every_two, heads, num_kv, norm_factor, ds_inference_config.triangular_masking, + ds_inference_config.local_attention, ds_inference_config.window_size, no_masking, layer_id, num_layers, alibi, + rope_theta, True, None, position_ids) + + assert (allclose(output_q, fallback_output_q)) + assert (allclose(output_k, fallback_output_k)) + assert (allclose(output_v, fallback_output_v)) + inference_module.release_workspace() + torch_fallback_kernels.release_workspace() diff --git a/tests/unit/ops/transformer/inference/test_transform_0213.py b/tests/unit/ops/transformer/inference/test_transform_0213.py new file mode 100644 index 000000000000..7c3dc512f73b --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_transform_0213.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright Habana Labs, Ltd. an Intel Company''' + +import pytest +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.op_builder.torch_fallback_builder import TorchInferenceOpBuilder +from .inference_test_utils import allclose, get_dtypes +import deepspeed.ops.op_builder.torch_fallback_kernels as torch_fallback_kernels + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytestmark = pytest.mark.skip(reason="Inference ops are not available on this system") + + +def get_inference_modules(): + return [InferenceBuilder().load(), TorchInferenceOpBuilder().load()] + + +def run_transform4d_0213_reference(input, seq): + return torch_fallback_kernels.transform4d_0213(input, seq) + + +def run_transform4d_0213_ds(inference_module, input, seq): + batch_size = input.shape[0] + heads = input.shape[1] + head_dim = input.shape[2] // seq + hidden_dim = heads * head_dim + + allocate_workspace_func = getattr(inference_module, + f"allocate_workspace_{torch_fallback_kernels.dtype_names_dict[input.dtype]}") + kernel_func = getattr(inference_module, f"transform4d_0213_{torch_fallback_kernels.dtype_names_dict[input.dtype]}") + allocate_workspace_func( + hidden_dim, + heads, + seq, + batch_size, + 1, # num_layers + 1, # mp_size + False, # external_cache + 0, # rank + 150, # max_out_tokens + 1) # min_out_tokens + return kernel_func(input, seq) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("inference_module", get_inference_modules()) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("num_heads", [1, 12]) +@pytest.mark.parametrize("sequence", [1, 18, 128]) +@pytest.mark.parametrize("head_dim", [8, 64, 256, 512]) +@pytest.mark.parametrize("dtype", get_dtypes()) +def test_transform4d_0213(inference_module, batch_size, num_heads, sequence, head_dim, dtype): + activations_ds = 
torch.randn((batch_size, num_heads, sequence * head_dim), + dtype=dtype, + device=get_accelerator().device_name()) + activations_ref = activations_ds.clone().detach() + ds_out = run_transform4d_0213_ds(inference_module, activations_ds, sequence) + ref_out = run_transform4d_0213_reference(activations_ref, sequence) + assert allclose(ds_out, ref_out) diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index 05c6a82ef55a..9deb5ff9efd7 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -71,7 +71,6 @@ def test(self, sequential_model, simple_config, batch_input, activation_checkpoi pipe_model = copy.deepcopy(sequential_model) pipe_model = PipelineModule(layers=pipe_model, num_stages=2) - # Ensure all parameters are accounted for. my_params = sum(p.numel() for p in pipe_model.parameters()) total_pipe_params = torch.LongTensor([my_params]).to(get_accelerator().device_name()) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index bbcb01b489f4..68fd3a0b7536 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -6,6 +6,7 @@ import torch import pytest import deepspeed +import os from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest @@ -82,6 +83,12 @@ def test(self): "top_modules": 3, }, } + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) @@ -91,7 +98,7 @@ def test(self): total_samples=50, hidden_dim=hidden_dim, device=model.device, - dtype=torch.half) + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index 8e736c1eaaa6..e6730aaa7ec5 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -7,6 +7,7 @@ """ import torch +import os import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced from deepspeed.accelerator import get_accelerator @@ -18,7 +19,10 @@ class TestReduceScatterCoalesced(DistributedTest): world_size = 2 def test_single_input(self): - input = torch.full((6, ), dist.get_rank(), dtype=torch.half, device=get_accelerator().current_device_name()) + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float + input = torch.full((6, ), dist.get_rank(), dtype=dtype, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) @@ -26,7 +30,10 @@ def test_single_input(self): assert torch.allclose(output, torch.full_like(output, 0.5)) def test_two_inputs(self): - tensor_kwargs = {"device": get_accelerator().current_device_name(), "dtype": torch.half} + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float + tensor_kwargs = {"device": get_accelerator().current_device_name(), "dtype": dtype} inputs = [ dist.get_rank() * torch.arange(0, 6, **tensor_kwargs), dist.get_rank() * torch.arange(6, 9, **tensor_kwargs), @@ -50,7 +57,10 @@ class 
TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): world_size = 2 def test(self): - input = torch.zeros((1, ), dtype=torch.half, device=get_accelerator().current_device_name()) + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float + input = torch.zeros((1, ), dtype=dtype, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py new file mode 100644 index 000000000000..fbf235fb7d62 --- /dev/null +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -0,0 +1,79 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch + +import deepspeed +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + + +@pytest.fixture +def base_config(): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + }, + "compile": { + "enabled": True, + "backend": "inductor" + } + } + return config_dict + + +class SmallModelWithCustomMethod(torch.nn.Module): + + def __init__(self, hidden_dim, test_value): + super(SmallModelWithCustomMethod, self).__init__() + self.fc = torch.nn.Linear(hidden_dim, hidden_dim, bias=False) + self.v = test_value + + def forward(self, x): + return self.fc(x) + + # Custom function that is not part of DeepSpeed engine. + def get_v(self): + return self.v + + +class TestCustomMethod(DistributedTest): + world_size = 1 + non_daemonic_procs = True + + def _init_engine(self, config, test_value): + hidden_dim = 10 + model = SmallModelWithCustomMethod(hidden_dim, test_value) + engine, _, _, _ = deepspeed.initialize(config=config, model=model, model_parameters=model.parameters()) + return engine + + def _run_model(self, engine): + train_batch_size = 1 + device = torch.device(get_accelerator().current_device_name()) + dtype = engine.module.fc.weight.dtype + hidden_dim = engine.module.fc.weight.shape[1] + x = torch.rand(train_batch_size, hidden_dim, device=device, dtype=dtype) + engine(x) + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_custom_function(self, base_config): + test_value = 10 + + engine = self._init_engine(base_config, test_value) + assert engine.module.get_v() == test_value + self._run_model(engine) + + # The model is compiled after the first run. + # Thus we make sure the custom method is still available after compilation. + assert engine.module.get_v() == test_value diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py new file mode 100644 index 000000000000..87e3c52b9e3c --- /dev/null +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch + +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum + +from unit.runtime.compile.util import compare_loss +from unit.common import DistributedTest +from unit.util import bf16_required_version_check + + +class TestZeRO(DistributedTest): + world_size = 2 + non_daemonic_procs = True + + @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16, torch.float32]) + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme]) + def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): + if dtype == torch.bfloat16 and not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + if offload_device == OffloadDeviceEnum.nvme: + if zero_stage != 3: + pytest.skip(f"Nvme offload not supported for zero stage {zero_stage}") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": zero_stage, + }, + "compile": { + "enabled": True, + "backend": "inductor" + } + } + + if offload_device == OffloadDeviceEnum.cpu: + config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} + elif offload_device == OffloadDeviceEnum.nvme: + config_dict["zero_optimization"]["offload_optimizer"] = { + "device": offload_device, + "nvme_path": str(tmpdir) + } + if dtype == torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif dtype == torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + compare_loss(self, config_dict, dtype) diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py new file mode 100644 index 000000000000..351e91d2f69b --- /dev/null +++ b/tests/unit/runtime/compile/test_load_config.py @@ -0,0 +1,114 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch + +from unit.simple_model import SimpleModel +import deepspeed +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest + +custom_backend_called = False +custom_compler_fn_called = False + +if deepspeed.is_compile_supported(): + # PyTorch v1 does not have torch.fx + def custom_backend(gm: torch.fx.GraphModule, example_inputs): + global custom_backend_called + custom_backend_called = True + return gm.forward + + def custom_compiler_fn(module: torch.nn.Module): + global custom_compler_fn_called + custom_compler_fn_called = True + return torch.compile(module) + + +@pytest.fixture +def base_config(): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + }, + "compile": { + "enabled": True, + "backend": "inductor" + } + } + return config_dict + + +class TestConfigLoad(DistributedTest): + world_size = 1 + non_daemonic_procs = True + + def _init_engine(self, config): + hidden_dim = 10 + model = SimpleModel(hidden_dim) + engine, _, _, _ = deepspeed.initialize(config=config, model=model, model_parameters=model.parameters()) + return engine + + def _run_model(self, engine): + train_batch_size = 1 + device = torch.device(get_accelerator().current_device_name()) + dtype = engine.module.linears[0].weight.dtype + hidden_dim = engine.module.linears[0].weight.shape[1] + x = torch.rand(train_batch_size, hidden_dim, device=device, dtype=dtype) + y = torch.randn_like(x) + engine(x, y) + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_compile(self, base_config): + engine = self._init_engine(base_config) + self._run_model(engine) + assert engine.is_compiled + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_custom_backend(self, base_config): + global custom_backend_called + custom_backend_called = False + + engine = self._init_engine(base_config) + engine.set_backend(f"{__name__}.custom_backend") + self._run_model(engine) + assert custom_backend_called + + def test_compile_disabled(self, base_config): + base_config["compile"]["enabled"] = False + engine = self._init_engine(base_config) + self._run_model(engine) + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_compile_kwargs(self, base_config): + base_config["compile"]["kwargs"] = {"mode": "default"} + engine = self._init_engine(base_config) + self._run_model(engine) + assert "mode" in engine.torch_compile_kwargs + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_set_compile_kwargs(self, base_config): + engine = self._init_engine(base_config) + engine.set_torch_compile_kwargs({"mode": "default"}) + self._run_model(engine) + assert "mode" in engine.torch_compile_kwargs + + @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") + def test_set_compiler_fn(self, base_config): + global custom_compler_fn_called + custom_compler_fn_called = False + + engine = self._init_engine(base_config) + engine.set_compiler_fn(custom_compiler_fn) + self._run_model(engine) + assert custom_compler_fn_called diff --git a/tests/unit/runtime/compile/util.py b/tests/unit/runtime/compile/util.py new file mode 100644 index 000000000000..86eadf3f6976 --- /dev/null 
+++ b/tests/unit/runtime/compile/util.py @@ -0,0 +1,124 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import random +import os +import numpy as np +from copy import deepcopy + +import torch + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.zero import GatheredParameters + +from unit.simple_model import SimpleModel +from typing import Callable, Any + + +class EnableDeterminism: + + def __init__(self, seed: int): + local_rank = int(os.getenv("LOCAL_RANK", "0")) + + self.seed = seed + local_rank + self.saved_random_state = None + self.saved_np_random_state = None + self.saved_cuda_launch_blocking = None + self.saved_cublas_workspace_config = None + self.saved_deterministic_algorithms = None + + def __enter__(self): + self.saved_random_state = random.getstate() + self.saved_np_random_state = np.random.get_state() + self.saved_acc_rng_state = get_accelerator().get_rng_state() + self.saved_cuda_launch_blocking = os.environ.get("CUDA_LAUNCH_BLOCKING", "") + self.saved_cublas_workspace_config = os.environ.get("CUBLAS_WORKSPACE_CONFIG", "") + self.saved_deterministic_algorithms = torch.are_deterministic_algorithms_enabled() + + random.seed(self.seed) + np.random.seed(self.seed) + get_accelerator().manual_seed(self.seed) + get_accelerator().manual_seed_all(self.seed) + + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + torch.use_deterministic_algorithms(True) + + def __exit__(self, type, value, traceback): + random.setstate(self.saved_random_state) + np.random.set_state(self.saved_np_random_state) + get_accelerator().set_rng_state(self.saved_acc_rng_state) + os.environ["CUDA_LAUNCH_BLOCKING"] = self.saved_cuda_launch_blocking + os.environ["CUBLAS_WORKSPACE_CONFIG"] = self.saved_cublas_workspace_config + torch.use_deterministic_algorithms(self.saved_deterministic_algorithms) + + +def enable_determinism(seed: int): + + def decorator(func: Callable) -> Callable: + + def wrapper(*args: Any, **kwargs: Any): + with EnableDeterminism(seed): + return func(*args, **kwargs) + + return wrapper + + return decorator + + +@enable_determinism(123) +def compare_loss(self, config, dtype): + iteration = 5 + hidden_dim = 10 + RTOL = 5e-1 + ATOL = 1e-2 + + device = torch.device(get_accelerator().current_device_name()) + model = SimpleModel(hidden_dim) + + i = get_accelerator().current_device() + baseline_model = deepcopy(model) + baseline_config = deepcopy(config) + baseline_config["zero_optimization"]["stage"] = 0 + baseline_config["zero_optimization"]["offload_optimizer"] = {} + baseline_config["compile"]["enabled"] = False + baseline_engine, baseline_optimizer, _, _ = deepspeed.initialize(config=baseline_config, + model=baseline_model, + model_parameters=baseline_model.parameters()) + + if config["zero_optimization"]["stage"] == 3: + with deepspeed.zero.Init(config_dict_or_path=config): + target_model = SimpleModel(hidden_dim) + with GatheredParameters(target_model.parameters(), modifier_rank=0): + for p1, p2 in zip(target_model.parameters(), model.parameters()): + p1.data.copy_(p2.data) + else: + target_model = deepcopy(model) + + target_engine, target_optimizer, _, _ = deepspeed.initialize(config=config, + model=target_model, + model_parameters=target_model.parameters()) + + train_batch_size = config["train_micro_batch_size_per_gpu"] + + xs = [torch.randn(train_batch_size, hidden_dim, device=device, dtype=dtype) for _ in range(iteration)] + ys = [torch.randn_like(x) 
for x in xs] + + for x, y in zip(xs, ys): + baseline_loss = baseline_engine(x, y) + target_loss = target_engine(x, y) + + assert torch.allclose(baseline_loss, target_loss, rtol=RTOL, atol=ATOL) + + baseline_engine.backward(baseline_loss) + target_engine.backward(target_loss) + + baseline_optimizer.step() + target_optimizer.step() + + with GatheredParameters(target_engine.parameters()): + for p1, p2 in zip(baseline_engine.parameters(), target_engine.parameters()): + assert torch.allclose(p1.to(dtype), p2, rtol=RTOL, atol=ATOL) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index ba795a853be0..14feb54550f0 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -33,6 +33,9 @@ pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", allow_module_level=True) +if get_accelerator().device_name() == 'hpu': + pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) class TestOneBitAdamBasic(DistributedTest): diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py index 3f551fb0fd4a..8a421753ce98 100644 --- a/tests/unit/runtime/half_precision/test_bf16.py +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -12,6 +12,7 @@ from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader from unit.util import bf16_required_version_check from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): @@ -196,6 +197,9 @@ def test(self, optimizer_constructor, zero_stage=2): hidden_dim = 10 model = SimpleModel(hidden_dim) + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().device_name() + model.to(device) client_optimizer = optimizer_constructor(params=model.parameters()) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) @@ -274,6 +278,9 @@ def test(self, stage=2): hidden_dim = 10 model = SimpleModel(hidden_dim) + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().current_device_name() + model.to(device) optimizer = torch.optim.Adam(model.parameters()) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, @@ -298,7 +305,10 @@ def test(self, comp_type, comm_type): pytest.skip( " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) - + if comm_type and (comp_type not in get_accelerator().supported_dtypes() + or comm_type not in get_accelerator().supported_dtypes()): + pytest.skip( + f"comp_type:{comp_type}, comm_type:{comm_type} not supported by {get_accelerator().device_name()}.") type_str = {torch.float16: "fp16", torch.bfloat16: "bfp16"} config_dict = { @@ -321,6 +331,12 @@ def test(self, comp_type, comm_type): hidden_dim = 10 model = SimpleModel(hidden_dim) + if get_accelerator().device_name() == 'hpu': + # TODO: remove this when the following is resolved: + # https://jira.habana-labs.com/browse/SW-137450 + config_dict["fp16"]["initial_scale_power"] = 30 + device = get_accelerator().current_device_name() + model.to(device) optimizer = torch.optim.Adam(model.parameters()) model, _, _, _ = 
deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, diff --git a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py index 2a58fd6b4a57..ddc274e9bc18 100644 --- a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py +++ b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -6,8 +6,14 @@ import torch import deepspeed import numpy as np +import pytest from unit.common import DistributedTest from unit.simple_model import SimpleModel +from deepspeed.ops.op_builder import FusedLambBuilder +from deepspeed.accelerator import get_accelerator + +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) def run_model_step(model, gradient_list): @@ -38,6 +44,7 @@ def test_no_overflow(self): "loss_scale_window": 2 } } + hidden_dim = 1 model = SimpleModel(hidden_dim) model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -143,6 +150,7 @@ def test_some_overflow(self): assert optim.cur_iter == expected_iteration +@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") class TestUnfused(DistributedTest): world_size = 1 diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 3d5e18b46502..d523c042fe9f 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -21,6 +21,9 @@ _amp_available = False amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed") +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) + class TestLambFP32GradClip(DistributedTest): world_size = 2 @@ -561,7 +564,6 @@ def test(self, zero_stage, optimizer_constructor): } } hidden_dim = 10 - model = SimpleModel(hidden_dim) client_optimizer = optimizer_constructor(params=model.parameters()) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py index 88e26290b650..eee8cf90fb8c 100644 --- a/tests/unit/runtime/pipe/test_pipe.py +++ b/tests/unit/runtime/pipe/test_pipe.py @@ -6,7 +6,6 @@ import copy import torch.nn as nn import pytest - import deepspeed.comm as dist from deepspeed.runtime.pipe.topology import PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule @@ -67,7 +66,6 @@ def test_pipe_base(self, topo_config): skip_on_arch(min_arch=7) topo = PipeTopo(**topo_config) steps = 100 # must be >=100 - # Allocate model for consistent initial weights. 
init_net = AlexNetPipe() diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py index 92da2257bdb0..89977bbf3668 100644 --- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -5,9 +5,14 @@ import torch import deepspeed +import pytest from unit.common import DistributedTest +from deepspeed.accelerator import get_accelerator from unit.util import skip_on_arch +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) + class Model(torch.nn.Module): diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py index 0689adc08670..01558cb8c6a7 100644 --- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -5,10 +5,14 @@ import torch import deepspeed +import pytest from unit.common import DistributedTest - +from deepspeed.accelerator import get_accelerator import deepspeed.utils.groups as groups +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) + class Model(torch.nn.Module): @@ -43,7 +47,6 @@ class TestSparseAdam(DistributedTest): def test(self): config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} - model = Model() optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) engine, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py index 9176770afda7..9692f105d69d 100644 --- a/tests/unit/runtime/test_autocast.py +++ b/tests/unit/runtime/test_autocast.py @@ -25,6 +25,7 @@ def test_missing_amp_autocast(self, half_op): output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype + @pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') def test_disable_autocast_linear(self, half_op): amp = get_accelerator().amp() diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index b9bd9c3aa56e..2515c89a1d0b 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -103,9 +103,14 @@ def data_post_process(data, data_sampler_state_dict): assert 'dummy_metric' in data_sampler_state_dict['current_difficulties'] return data + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 hidden_dim = 10 model = SimpleModel(hidden_dim) - dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) + dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=dtype) model, _, data_loader, _ = deepspeed.initialize(config=config_dict, model=model, training_data=dataset, @@ -159,9 +164,19 @@ def test_fixed_discrete(self): hidden_dim = 10 ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + model = Curriculum_SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, 
model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=20, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) @@ -202,10 +217,19 @@ def test_fixed_linear(self): } hidden_dim = 10 ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = Curriculum_SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=20, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 6cd01644fad5..6fa008e1aaa2 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -9,6 +9,7 @@ import json import hjson import argparse +import torch from deepspeed.runtime.zero.config import DeepSpeedZeroConfig from deepspeed.accelerator import get_accelerator @@ -163,11 +164,23 @@ class TestConfigLoad(DistributedTest): world_size = 1 def test_dict(self, base_config): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + hidden_dim = 10 model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters()) def test_json(self, base_config, tmpdir): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: json.dump(base_config, fp) @@ -176,6 +189,12 @@ def test_json(self, base_config, tmpdir): model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) def test_hjson(self, base_config, tmpdir): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: hjson.dump(base_config, fp) @@ -188,6 +207,12 @@ class TestDeprecatedDeepScaleConfig(DistributedTest): world_size = 1 def test(self, base_config, tmpdir): + + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 config_path = create_config_from_dict(tmpdir, base_config) parser = argparse.ArgumentParser() args = parser.parse_args(args='') @@ -198,7 +223,11 @@ def test(self, base_config, tmpdir): model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) - data_loader = 
random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -210,13 +239,22 @@ class TestDistInit(DistributedTest): def test(self, base_config): hidden_dim = 10 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters(), dist_init_required=True) - data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -229,11 +267,19 @@ class TestInitNoOptimizer(DistributedTest): def test(self, base_config): del base_config["optimizer"] hidden_dim = 10 - + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["fp32"] = {"enabled": True} + dtype = torch.float32 model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model) - data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) with pytest.raises(AssertionError): @@ -246,16 +292,28 @@ class TestArgs(DistributedTest): world_size = 1 def test_none_args(self, base_config): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) - data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device, dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) def test_no_args(self, base_config): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(model=model, config=base_config) - data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device, dtype=dtype) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -264,7 +322,11 @@ class TestNoModel(DistributedTest): world_size = 1 def test(self, base_config): + if os.getenv("REPLACE_FP16", default=None): + base_config["fp16"]["enabled"] = False + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) + with pytest.raises(AssertionError): model, _, _, _ = deepspeed.initialize(model=None, config=base_config) diff --git 
a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index 8ec9f05a0a17..4a633a96f32c 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -5,6 +5,7 @@ import pytest from typing import Callable +import os import torch from torch.optim import Optimizer, Adam, AdamW from torch.optim.lr_scheduler import _LRScheduler, LambdaLR @@ -18,6 +19,8 @@ from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR from deepspeed.runtime.config import ADAM_OPTIMIZER from deepspeed.runtime.utils import see_memory_usage, required_torch_version +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import FusedAdamBuilder @pytest.mark.parametrize('zero_stage', [0, 3]) @@ -43,6 +46,11 @@ def test(self, zero_stage): # 20B test #hidden_dim = 16 * 1024 hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config): model = SimpleModel(hidden_dim, nlayers=78) @@ -53,7 +61,7 @@ def test(self, zero_stage): total_samples=50, hidden_dim=hidden_dim, device=model.device, - dtype=torch.half) + dtype=dtype) for batch in data_loader: model(batch[0], batch[1]) see_memory_usage('post-fwds', force=True) @@ -68,6 +76,9 @@ def test(self, optimizer_type): def _optimizer_callable(params) -> Optimizer: return AdamW(params=params) + if (optimizer_type is None) and (not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]): + pytest.skip("FusedAdam is not compatible") + hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -79,13 +90,13 @@ def _optimizer_callable(params) -> Optimizer: client_optimizer = Adam(model.parameters()) else: client_optimizer = _optimizer_callable - _, ds_optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=list(model.parameters()), optimizer=client_optimizer) if client_optimizer is None: - assert isinstance(ds_optimizer, FusedAdam) + optim = FusedAdam + assert isinstance(ds_optimizer, optim) elif isinstance(client_optimizer, Optimizer): assert ds_optimizer == client_optimizer else: @@ -96,8 +107,11 @@ def _optimizer_callable(params) -> Optimizer: class TestConfigOptimizer(DistributedTest): world_size = 1 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test(self, client_parameters): ds_config = {"train_batch_size": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.001}}} + optimizer = FusedAdam hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -108,8 +122,7 @@ def test(self, client_parameters): model_parameters = None _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model_parameters) - - assert isinstance(ds_optimizer, FusedAdam) + assert isinstance(ds_optimizer, optimizer) @pytest.mark.parametrize('optimizer_extension', ['zero1', 'zero2', 'zero3', 'amp', None]) @@ -136,6 +149,8 @@ def test(self, optimizer_extension, model_dtype, grad_accum_dtype): pytest.skip( "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) + if fp16 and torch.float16 not in get_accelerator().supported_dtypes(): + pytest.skip(f"FP16 not supported by {get_accelerator().device_name()}") if amp and not required_amp_check(): pytest.skip("Amp is not installed can't run amp check") # Config 
declaration @@ -216,12 +231,16 @@ def test(self, optimizer_extension, model_dtype, grad_accum_dtype): # BF16 Wrapper is_supported[(None, 'bf16', 'fp32')] = True is_supported[(None, 'bf16', None)] = True + is_supported[(None, 'bf16', 'bf16')] = True # No Wrapper is_supported[(None, 'fp32', None)] = True is_supported[(None, 'fp32', 'fp32')] = True hidden_dim = 10 model = SimpleModel(hidden_dim) + # TODO: SW-145674 remove this WA when SW-145671 is resolved. + if get_accelerator().device_name() == 'hpu': + model.to(get_accelerator().device_name()) model_parameters = list(model.parameters()) if key in is_supported: diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index d9aba419b158..6c3263f7729c 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -5,9 +5,14 @@ import torch import deepspeed +import pytest from pytest import approx from unit.common import DistributedTest from unit.multi_output_model import MultiOutputModel, multi_output_dataloader +from deepspeed.accelerator import get_accelerator + +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) class TestTwoOutputModel(DistributedTest): @@ -35,7 +40,6 @@ def test(self, tmpdir): hidden_dim = 10 weight_value = 0.1 - model = MultiOutputModel(hidden_dim, weight_value) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = 4 diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py index 1f602db73b2f..5f3a328ba66b 100644 --- a/tests/unit/runtime/test_pld.py +++ b/tests/unit/runtime/test_pld.py @@ -5,9 +5,10 @@ import numpy as np import deepspeed +import torch +import os import pytest from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop - from unit.common import DistributedTest from unit.simple_model import SimpleModel, PLD_SimpleModel, random_dataloader @@ -49,11 +50,19 @@ def test_pld_model(self, theta): } } hidden_dim = 10 - + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = PLD_SimpleModel(hidden_dim, empty_grad=False) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -90,11 +99,20 @@ def test_non_pld_model(self): } } hidden_dim = 10 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim, empty_grad=False) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): with pytest.raises(TypeError): diff --git 
a/tests/unit/runtime/zero/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py index aade488fde42..cc31d1386c1c 100644 --- a/tests/unit/runtime/zero/test_ignore_unused_parameters.py +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -4,10 +4,11 @@ # DeepSpeed Team import pytest +import os +import torch from unit.common import DistributedTest from unit.simple_model import UnusedParametersModel, random_dataloader from deepspeed.ops.op_builder import CPUAdamBuilder - import deepspeed @@ -42,11 +43,20 @@ def test(self, ignore_unused_parameters): } } hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} + dtype = torch.float model = UnusedParametersModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) def _loop(): for n, batch in enumerate(data_loader): diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index db81a0578160..959952e6633f 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -6,6 +6,7 @@ import math from collections import namedtuple from typing import Dict, List, NamedTuple, Set, Tuple +import os import pytest import deepspeed.comm as dist import torch @@ -76,10 +77,18 @@ def test(self, zero_stage): }, } hidden_dim = 4 - + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) run_unbalanced_gradients(model, data_loader) @@ -110,6 +119,11 @@ def test(self, zero_stage=3): }, } hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 class AlbertLikeModel(torch.nn.Module): @@ -127,7 +141,11 @@ def forward(self, x, y): model = AlbertLikeModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -188,6 +206,11 @@ def forward(self, x, y): return self.cross_entropy_loss(hidden, y) hidden_dim = 3 # do not change + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 world_size = dist.get_world_size() # we want at least 2x layers as there are gpus to trigger round_robin_fp16_groups reshuffle in zero2 
@@ -195,10 +218,15 @@ def forward(self, x, y): model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers, freeze_params=freeze_params) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + # Flush zero stage 3 cache model.empty_partition_cache() - data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -276,10 +304,17 @@ def forward(self, x, y): return self.cross_entropy_loss(hidden, y) hidden_dim = 3 + dtype = torch.half world_size = dist.get_world_size() n_layers = world_size * 2 model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers, freeze_params=freeze_params) + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + if get_accelerator().device_name() == 'hpu': + model.to(get_accelerator().device_name()) optim_groups = [ { @@ -301,7 +336,11 @@ def forward(self, x, y): ) model.empty_partition_cache() - data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -365,6 +404,11 @@ def test(self, allgather_bucket_size, zero_stage=2): }, } hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim=hidden_dim) if allgather_bucket_size % 2 == 0: @@ -400,6 +444,11 @@ def test(self, zero_stage=2): }, } hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -416,6 +465,11 @@ def test(self, zero_stage=2): def _ds_initialize_for_param_partitioning_testing(model: Module, cfg: dict) -> DeepSpeedEngine: + if os.getenv("REPLACE_FP16", default=None): + if 'fp16' in cfg: + cfg["fp16"]["enabled"] = False + cfg["bf16"] = {"enabled": True} + ds_engine, _, _, _ = deepspeed.initialize(config=cfg, model=model, model_parameters=model.parameters()) return ds_engine @@ -695,6 +749,9 @@ def _test( "pin_memory": True, } + if os.getenv("REPLACE_FP16", default=None): + fp16_enabled = False + cfg["fp16"]["enabled"] = False ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) for i, weight in enumerate(weights): weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank())) @@ -857,12 +914,20 @@ def forward(self, x: Tensor) -> Tensor: "loss_scale": 1.0, }, } - with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager): + dtype = torch.float16 + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + ds_config["fp16"]["enabled"] = False + ds_config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + zero3_init_dtype = torch.bfloat16 + + with deepspeed.zero.Init(dtype=zero3_init_dtype, mem_efficient_linear=False, 
enabled=init_context_manager): model = LargeParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) for train_iter in range(3): # test multiple iterations to cover prefetching - activation: Tensor = ds_engine(torch.ones(param_sz, dtype=torch.float16, device=ds_engine.device)) + activation: Tensor = ds_engine(torch.ones(param_sz, dtype=dtype, device=ds_engine.device)) partition_sz = math.ceil(param_sz / self.world_size) for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): @@ -893,7 +958,6 @@ class ManyParamModel(Module): def __init__(self) -> None: super().__init__() - self.modulelist = ModuleList( EltwiseMultiplicationModule(weight=Parameter(torch.empty((param_sz, ), dtype=torch.float32))) for _ in range(n_layers)) @@ -936,6 +1000,11 @@ def forward(self, x: Tensor) -> Tensor: "loss_scale": 1.0, }, } + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + ds_cfg["fp16"]["enabled"] = False + ds_cfg["bf16"] = {"enabled": True} + dtype = torch.bfloat16 with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager): model = ManyParamModel() @@ -943,12 +1012,11 @@ def forward(self, x: Tensor) -> Tensor: ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) for _ in range(3): # test multiple iterations to cover prefetching - activations: List[Tensor] = ds_engine( - torch.ones((param_sz, ), dtype=torch.float16, device=ds_engine.device)) + activations: List[Tensor] = ds_engine(torch.ones((param_sz, ), dtype=dtype, device=ds_engine.device)) assert len(activations) == n_layers partition_sz = math.ceil(param_sz / self.world_size) - expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) + expected_activations = torch.empty(param_sz, dtype=dtype, device=ds_engine.device) for start_idx in range(0, param_sz, partition_sz): expected_activations[start_idx:start_idx + partition_sz] = dist.get_rank() @@ -976,8 +1044,8 @@ class ModelWhereParentInitializesChildWeights(Module): def __init__(self) -> None: super().__init__() - - self.linear = Linear(12, 1) + dev = get_accelerator().device_name() + self.linear = Linear(12, 1, device=dev) self.apply(self.__init_weights) @@ -1005,6 +1073,9 @@ def __init_weights(self, module): "loss_scale": 1.0, }, } + if os.getenv("REPLACE_FP16", default=None): + ds_cfg["fp16"]["enabled"] = False + ds_cfg["bf16"] = {"enabled": True} with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True): model = ModelWhereParentInitializesChildWeights() @@ -1212,10 +1283,19 @@ def test(self): }, } hidden_dim = 10 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1245,6 +1325,13 @@ def test(self, return_type): }, } hidden_dim = 10 + dtype = torch.half + zero3_init_dtype = None + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + 
zero3_init_dtype = torch.bfloat16 class MyModel(torch.nn.Module): @@ -1266,11 +1353,15 @@ def forward(self, x, y): raise NotImplementedError return val - with deepspeed.zero.Init(): + with deepspeed.zero.Init(dtype=zero3_init_dtype): model = MyModel(hidden_dim) model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1309,12 +1400,21 @@ def test(self, zero_stage): }, } hidden_dim = 4 + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1380,11 +1480,20 @@ def forward(self, x, y): val = (x, loss) return val - with deepspeed.zero.Init(config_dict_or_path=config_dict, enabled=zero_stage == 3): - model = MyModel(hidden_dim) + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + with deepspeed.zero.Init(dtype=dtype, config_dict_or_path=config_dict, enabled=zero_stage == 3): + model = MyModel(hidden_dim) model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1414,8 +1523,13 @@ def test(self, force_ds_optim): "zero_force_ds_cpu_optimizer": force_ds_optim, } hidden_dim = 10 - model = SimpleModel(hidden_dim) + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + if get_accelerator().device_name() == 'hpu': + device = get_accelerator().current_device_name() + model.to(device) optimizer = torch.optim.Adam(model.parameters()) @@ -1443,6 +1557,11 @@ def test_training_partition_cache(self, training): "stage3_param_persistence_threshold": hidden_dim, }, } + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 if training: config_dict["optimizer"] = {"type": "Adam"} @@ -1451,7 +1570,6 @@ def test_training_partition_cache(self, training): model, _, _, _ = deepspeed.initialize(model=model, config=config_dict) - dtype = torch.half data_loader = random_dataloader( model=model, total_samples=6, @@ -1507,9 +1625,8 @@ def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_grou "enabled": dtype == torch.bfloat16 } } - if 
use_client_optimizer: - optimizer = deepspeed.ops.adam.FusedAdam(param_groups, lr=0.1) + optimizer = torch.optim.AdamW(param_groups, lr=0.1) model_parameters = model.parameters() else: config_dict["optimizer"] = {"type": "adamw"} diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py index aabe7f0b7f15..5889b7101702 100644 --- a/tests/unit/runtime/zero/test_zero_context.py +++ b/tests/unit/runtime/zero/test_zero_context.py @@ -7,6 +7,7 @@ import torch import deepspeed +import os from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape import deepspeed.comm as dist @@ -71,7 +72,12 @@ def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) - with deepspeed.zero.Init(config_dict_or_path=config_dict): + dtype = None + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + with deepspeed.zero.Init(config_dict_or_path=config_dict, dtype=dtype): model = MyModel(hidden_dim) with deepspeed.zero.GatheredParameters(list(model.parameters())): @@ -88,7 +94,12 @@ class TestSerialContext(DistributedTest): def test_subclass_param(self): setup_serial_env() - with deepspeed.zero.Init(config=config): + dtype = None + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + with deepspeed.zero.Init(config=config, dtype=dtype): model = ConvNet() assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE @@ -215,7 +226,11 @@ def forward(self, input): return C.sum() net = ExtLinear() - + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 args = SimpleNamespace(local_rank=0) engine, optim, _, _ = deepspeed.initialize(args=args, model=net, @@ -225,7 +240,7 @@ def forward(self, input): with deepspeed.zero.GatheredParameters(net.linear1.weight): assert net.linear1.weight.numel() == net.dim**2 - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(dtype) loss = engine(input) engine.backward(loss) engine.step() @@ -235,7 +250,12 @@ class TestScatterGather(DistributedTest): world_size = 2 def test(self): - with deepspeed.zero.Init(): + dtype = None + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + with deepspeed.zero.Init(dtype=dtype): l = torch.nn.Linear(6, 3) assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE assert l.weight.shape == torch.Size(partitioned_param_data_shape) @@ -254,7 +274,10 @@ class TestGatherUpdate(DistributedTest): world_size = 2 def test(self): - with deepspeed.zero.Init(): + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float32 + with deepspeed.zero.Init(dtype=dtype): l = torch.nn.Linear(4, 2) assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 21955f5df152..07c32cb25fc6 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -5,6 +5,7 @@ import torch import deepspeed +import os from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from 
deepspeed.accelerator import get_accelerator @@ -67,7 +68,12 @@ class TestSerialParamInit(DistributedTest): def test_subclass_param_init(self): setup_serial_env() - with deepspeed.zero.Init(config=config): + dtype = None + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 + with deepspeed.zero.Init(config=config, dtype=dtype): model = Son().cpu() # test that all params have been partitioned @@ -107,7 +113,10 @@ def __init__(self): def magic(self): return 42 - with deepspeed.zero.Init(): + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float32 + with deepspeed.zero.Init(dtype=dtype): model = Model() engine, *_ = deepspeed.initialize(model=model, config=ds_config, model_parameters=model.parameters()) assert engine.magic() == 42 diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py index 874a8ea3b676..ac790da9f45e 100644 --- a/tests/unit/runtime/zero/test_zero_context_return.py +++ b/tests/unit/runtime/zero/test_zero_context_return.py @@ -7,6 +7,7 @@ import torch import pytest import deepspeed +import os from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from utils import setup_serial_env @@ -137,12 +138,16 @@ def test_ext_param_return(self): setup_serial_env() net = DanglingExt() - + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 args = SimpleNamespace(local_rank=0) engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(dtype) loss = engine(input) engine.backward(loss) engine.step() @@ -151,14 +156,18 @@ def test_ext_param_return(self): def test_ext_param_returnobj(self): setup_serial_env() print() - + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 net = ModelContainer(return_obj=True) args = SimpleNamespace(local_rank=0) engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(dtype) loss = engine(input) assert len(net._external_params) == 1 assert len(net.dangler._external_params) == 0 @@ -169,14 +178,18 @@ def test_ext_param_returnobj(self): def test_stage_3_output_type(self, output_type): setup_serial_env() print() - + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config["fp16"]["enabled"] = False + config["bf16"] = {"enabled": True} + dtype = torch.bfloat16 net = ModelContainerVariableOutputType(output_type=output_type) args = SimpleNamespace(local_rank=0) engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(1): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(dtype) loss = engine(input) if loss is not None: if isinstance(loss, dict): diff --git a/tests/unit/runtime/zero/test_zero_dynamic_class.py b/tests/unit/runtime/zero/test_zero_dynamic_class.py index e235206d4dc4..b2cb66dfc79c 100644 --- 
a/tests/unit/runtime/zero/test_zero_dynamic_class.py +++ b/tests/unit/runtime/zero/test_zero_dynamic_class.py @@ -6,7 +6,6 @@ import torch from unit.common import DistributedTest - import deepspeed @@ -15,7 +14,6 @@ class TestNewClassDeclaredNestingInit(DistributedTest): def test_new_class_declared_nesting_init(self): ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3)) - with deepspeed.zero.Init(config_dict_or_path=ds_config): class MyModel(torch.nn.Module): diff --git a/tests/unit/runtime/zero/test_zero_nesting_init.py b/tests/unit/runtime/zero/test_zero_nesting_init.py index 143e7e997b13..668044da33c3 100644 --- a/tests/unit/runtime/zero/test_zero_nesting_init.py +++ b/tests/unit/runtime/zero/test_zero_nesting_init.py @@ -9,7 +9,6 @@ from transformers import VisionEncoderDecoderModel from transformers.deepspeed import HfDeepSpeedConfig - import deepspeed @@ -18,7 +17,6 @@ class TestNestingInit(DistributedTest): def test_nesting_init(self): ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3)) - with deepspeed.zero.Init(config_dict_or_path=ds_config): with deepspeed.zero.Init(config_dict_or_path=ds_config): model = torch.nn.Linear(4, 4) diff --git a/tests/unit/runtime/zero/test_zero_offloadpp.py b/tests/unit/runtime/zero/test_zero_offloadpp.py index c376686f8052..18fd40a620e3 100644 --- a/tests/unit/runtime/zero/test_zero_offloadpp.py +++ b/tests/unit/runtime/zero/test_zero_offloadpp.py @@ -6,11 +6,9 @@ import deepspeed.comm as dist from unit.common import DistributedTest from unit.simple_model import random_dataloader - import deepspeed - +import os from deepspeed.runtime.zero.offload_config import DeepSpeedZeroOffloadOptimizerConfig - import torch.nn as nn @@ -63,6 +61,9 @@ def test(self, h_dim: int, n_layers: int) -> None: } } } + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["fp32"] = {"enabled": True} model = NNModel(h_dim, n_layers) model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index e50b03035bad..f63dfdc273f4 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -6,6 +6,7 @@ import pytest import deepspeed.comm as dist import torch +import os from unit.common import DistributedTest from unit.simple_model import random_dataloader, SimpleModel @@ -131,6 +132,11 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz "stage": zero_stage, } } + dtype = torch.float16 + if os.getenv("REPLACE_FP16", default=None): + config_dict["fp16"]["enabled"] = False + config_dict["bf16"] = {"enabled": True} + dtype = torch.bfloat16 if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} @@ -142,20 +148,20 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz hidden_dim = 128 if zero_stage == 3: - with deepspeed.zero.Init(config_dict_or_path=config_dict): + with deepspeed.zero.Init(config_dict_or_path=config_dict, dtype=dtype): model = MyModel(hidden_dim, frozen_weights) else: model = MyModel(hidden_dim, frozen_weights) validate_func = validate_funcs_mapping[api_type] - run_fragmented_model(model, config_dict, hidden_dim, torch.float16, validate_func) + run_fragmented_model(model, config_dict, hidden_dim, dtype, validate_func) def 
test_bf16_fragments(self, frozen_weights): if frozen_weights: pytest.skip("TODO: Frozen weights not currently supported by BF16 Optimizer") - if not bf16_required_version_check(accelerator_check=False): + if (not bf16_required_version_check(accelerator_check=False)): pytest.skip( " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) @@ -310,7 +316,6 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} elif dtype == torch.bfloat16: config_dict["bf16"] = {"enabled": True} - hidden_dim = 128 if zero_stage == 3: config_dict["zero_optimization"]["param_persistence_threshold"] = hidden_dim diff --git a/tests/unit/skip_marker.py b/tests/unit/skip_marker.py new file mode 100644 index 000000000000..b6f066d406ef --- /dev/null +++ b/tests/unit/skip_marker.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +hpu_skip_tests = {} + +g1_skip_tests = { + "unit/runtime/zero/test_zero_context.py::TestSerialContext::test_scatter_halftype": + "FP16 datatype is not supported by Gaudi.", + "unit/runtime/test_autocast.py::TestAutoCastDisable::test_missing_amp_autocast[True]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_inference.py::TestAutoTensorParallelism::test[noCG-fp16-marian]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShardinAutoTP::test[EleutherAI/gpt-j-6B]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShardinAutoTP::test[facebook/opt-125m]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShardinAutoTP::test[EleutherAI/gpt-neo-125M]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShardinAutoTP::test[bigscience/bloom-560m]": + "FP16 datatype is not supported by Gaudi.", + "unit/inference/test_inference.py::TestAutoTensorParallelism::test_odd_world_size[fp16-marian]": + "FP16 datatype is not supported by Gaudi.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-True-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-False-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test[True]": + "Stuck", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]": + "Skipping test due to segfault. SW-170285", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp16]": + "Skipping test due to segfault then stuck. SW-174912", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp16]": + "Skipping test due to segfault then stuck. SW-174912", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp16]": + "Skipping test due to segfault then stuck. 
SW-174912", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp32]": + "FusedAdam test not supported. Test got stuck.", +} + +g2_skip_tests = { + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-True-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-True-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-False-True]": + "Skipping test due to segfault. SW-170285", + "unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test[True]": + "Stuck", + "unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]": + "Skipping test due to segfault. SW-170285", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp16]": + "Skipping test due to segfault then stuck. SW-174912", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp16]": + "Skipping test due to segfault then stuck. SW-174912", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp16]": + "Skipping test due to segfault then stuck. SW-174912", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp32]": + "FusedAdam test not supported. 
Test got stuck.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp16]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp32]": + "FusedAdam test not supported. Test got stuck.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp32]": + "FusedAdam test not supported. Test got stuck.", +} + +gpu_skip_tests = { + "unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test[True]": + "Disabled as it is causing test to stuck. SW-163517.", +} diff --git a/tests/unit/util.py b/tests/unit/util.py index 13eab3ef3a72..3f42b2838fec 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -5,6 +5,8 @@ import pytest import torch +import os +import multiprocessing from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported from deepspeed.git_version_info import torch_info from packaging import version as pkg_version @@ -47,11 +49,14 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' + hpu_available = get_accelerator().device_name() == 'hpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True elif npu_available: return True + elif hpu_available: + return True else: return False @@ -76,3 +81,50 @@ def required_amp_check(): return False else: return True + + +def worker(proc_id, return_dict): + #TODO SW-114787: move to new api outside experimental + import habana_frameworks.torch.utils.experimental as htexp + deviceType = htexp._get_device_type() + if deviceType == htexp.synDeviceType.synDeviceGaudi: + return_dict['devicetype'] = "Gaudi" + elif deviceType == htexp.synDeviceType.synDeviceGaudi2: + return_dict['devicetype'] = "Gaudi2" + elif deviceType == htexp.synDeviceType.synDeviceGaudi3: + return_dict['devicetype'] = "Gaudi3" + else: + return_dict['devicetype'] = None + assert False, f'Unexpected hpu device Type: {deviceType}' + + +def get_hpu_dev_version(): + hpu_dev = None + if get_accelerator().device_name() != 'hpu': + return hpu_dev + if os.getenv("DEEPSPEED_UT_HL_DEVICE", default=None): + hpu_dev = os.getenv("DEEPSPEED_UT_HL_DEVICE") + if hpu_dev not in ["Gaudi", "Gaudi2", "Gaudi3"]: + manager = multiprocessing.Manager() + return_dict = manager.dict() + proc_id = 0 + multiprocessing.set_start_method("spawn", force=True) + p = multiprocessing.Process(target=worker, args=(proc_id, return_dict)) + p.start() + p.join() + try: + dev_type = return_dict['devicetype'] + except: + assert False, 'Unexpected hpu device Type: {}'.format(return_dict['devicetype']) + p.terminate() + exit_code = p.exitcode + if exit_code: + assert False, 'HPU dev type 
process exit with: {}'.format(exit_code) + if dev_type in ["Gaudi", "Gaudi2", "Gaudi3"]: + hpu_dev = dev_type + os.environ['DEEPSPEED_UT_HL_DEVICE'] = dev_type + return dev_type + else: + assert False, 'Unexpected hpu device Type: {}'.format(return_dict['devicetype']) + else: + return hpu_dev diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py index 5d84e9be855a..75ab8050f28b 100644 --- a/tests/unit/utils/test_init_on_device.py +++ b/tests/unit/utils/test_init_on_device.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +import os import pytest from unit.simple_model import SimpleModel from deepspeed import OnDevice @@ -20,9 +21,12 @@ def test_on_device(self, device): if device == "meta" and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"): pytest.skip("meta tensors only became stable after torch 1.10") - with OnDevice(dtype=torch.half, device=device): + dtype = torch.half + if os.getenv("REPLACE_FP16", default=None): + dtype = torch.float + with OnDevice(dtype=dtype, device=device): model = SimpleModel(4) for p in model.parameters(): assert p.device == torch.device(device) - assert p.dtype == torch.half + assert p.dtype == dtype diff --git a/tests/unit/xfail_marker.py b/tests/unit/xfail_marker.py new file mode 100644 index 000000000000..3e8563d147b5 --- /dev/null +++ b/tests/unit/xfail_marker.py @@ -0,0 +1,5376 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +hpu_xfail_tests = {} + +g1_xfail_tests = { + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-350m-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-bf16-noCG-noTriton]": + "Xfail, due 
to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[cross-encoder/ms-marco-MiniLM-L-12-v2-text-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/minilm-uncased-squad2-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to 
SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/minilm-uncased-squad2-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[cross-encoder/ms-marco-MiniLM-L-12-v2-text-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-350m-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/pythia-70m-deduped-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163095.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/pythia-70m-deduped-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163095.", + 
"unit/inference/test_inference.py::TestModelTask::test[distilgpt2-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-bf16-noCG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp32-CG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/gpt-j-6b-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp32-noCG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[distilgpt2-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-bf16-CG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/gpt-j-6b-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp16]": + "Xfail, due to SW-162575.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero3]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero3]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_model_quantization": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization_nvme_offload": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant_cpu_offload": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_half_int4_quantization": + "float16/half is not supported on Gaudi.", + 
"unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization_cpu_offload": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_quantized_linear": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_half_int8_quantization": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant_nvme_offload": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_pipeline_checkpoint_loading[3]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuAdamW-AdamW]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuSGD-SGD]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuAdam-Adam]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuAdamW-AdamW]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuAdam-Adam]": + "float16/half is not supported on Gaudi.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuSGD-SGD]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_save_exclude_frozen_weights[2]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpointFrozenWeights::test_save_exclude_frozen_weights[1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[2-20-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[1-8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[1-20-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[2-8-2000]": + "float16/half is not supported on Gaudi.", + 
"unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[4-20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[4-8-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[4-20-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[1-8-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[2-8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[1-20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[2-20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TesthpZeroConfigSweep::test[4-8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[2-8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[2-20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[4-20-4000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[4-8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[2-8-4000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[4-20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[2-20-4000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_hpzero.py::TestSecondaryTensorSize::test[4-8-4000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qgzero.py::TesthpZeroConfigSweep::test[20-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qgzero.py::TesthpZeroConfigSweep::test[8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qgzero.py::TesthpZeroConfigSweep::test[20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qgzero.py::TesthpZeroConfigSweep::test[8-2000]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[8-2048]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[20-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[8-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[20-2048]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[2]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[1]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[1]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[1]": + "float16/half is not supported on Gaudi.", + 
"unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[4]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[4]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[2]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[2]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[4]": + "float16/half is not supported on Gaudi.", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShardinAutoTP::test[facebook/opt-350m]": + "float16/half is not supported on Gaudi.", + "unit/inference/test_inference.py::TestAutoTensorParallelism::test[fp16-marian]": + "float16/half is not supported on Gaudi.", + "unit/inference/test_inference.py::TestLowCpuMemUsage::test[gpt2]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-9-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-4-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-9-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-9-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-4-1024]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-4-1024]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-255-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-1-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-1-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-1-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-255-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-255-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-255-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-1232-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-1-2]": + "float16/half is not supported on Gaudi.", + 
"unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-255-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-1-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-255-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-4096-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-128-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-512-1-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[True-dtype0-512-1-1]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-4096-128-2]": + "float16/half is not supported on Gaudi.", + "unit/ops/transformer/inference/test_gelu.py::test_gelu[False-dtype0-1232-255-2]": + "float16/half is not supported on Gaudi.", + "unit/inference/test_inference.py::TestAutoTP::test[falcon]": + "float16/half is not supported on Gaudi.", + "unit/inference/test_inference.py::TestAutoTensorParallelism::test[fp16-codegen]": + "float16/half is not supported on Gaudi.", + "unit/moe/test_moe.py::TestMoE::test[False-2-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-2-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-1-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-2-2]": + "Xfail, due to SW-116160.", + 
"unit/moe/test_moe.py::TestMoE::test[True-1-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-1-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-2-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-1-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-0-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[False-0-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[True-0-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[True-0-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestPRMoE::test[2-True]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestPRMoE::test[2-False]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-1-2]": + "Xfail, due to SW-162657.", + "unit/runtime/zero/test_zero_context.py::TestSerialContext::test_subclass_param": + "Xfail, due to SW-156783.", + "unit/runtime/zero/test_zero_context_ancestry.py::TestSerialParamInit::test_subclass_param_init": + "Xfail, due to SW-143227.", + "unit/inference/test_inference.py::TestMPSize::test[fp32-gpt-neo]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[fp32-gpt-j]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[bf16-gpt-neo]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[bf16-gpt-j]": + "Xfail, due to SW-162660.", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-256-52-4-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-2048-128-32-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-25-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-25-3-True-True]": + "CUDA tests not supported by HPU", + 
"unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-128-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-True-True0]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-160-128-2-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-True-True1]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-2560-128-40-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-4096-128-64-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-160-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-120-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-512-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-1024-53-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1536-128-24-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-160-128-2-24-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-8192-128-64-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-511-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[1-256-2048-32-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[3-1024-54-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-1024-21-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-2-3-True-True-0.05]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-160-128-2-3-True-True-0.1]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-25-3-True-True-0.05]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-160-128-2-24-False-True-0.2]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-1600-128-2-4-False-True-0.2]": + "CUDA tests not supported by HPU", + 
"unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=1]": + "Xfail, due to SW-166162.", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=2]": + "Xfail, due to SW-166162.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-167459.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[facebook/opt-1.3b-bsz=1]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[facebook/opt-1.3b-bsz=2]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[EleutherAI/gpt-neo-1.3B-bsz=1]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[EleutherAI/gpt-neo-1.3B-bsz=2]": + "Xfail, due to SW-167459.", + "unit/inference/test_stable_diffusion.py::TestStableDiffusion::test": + "Xfail, due to SW-170181.", + "unit/runtime/zero/test_zero_offloadpp.py::TestZeroPartialOffloadConfigSweep::test[8-1024]": + "Xfail, due to SW-170288.", + "unit/runtime/zero/test_zero_offloadpp.py::TestZeroPartialOffloadConfigSweep::test[4-1024]": + "Xfail, due to SW-170288.", + "unit/compression/test_dequantization.py::TestDequantization::test_dequantize": + "Xfail, due to SW-168442.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp16]": + "Xfail, due to SW-145262. Gaudi1 does not support FP16.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp16]": + "Xfail, due to SW-145262. 
Gaudi1 does not support FP16.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-True-True]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-True-False]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-True]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-False]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_model_quantization[8bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_half_int8_quantization": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_nvme_offload": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_nvme_offload": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_cpu_offload[4bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization[4bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_half_int4_quantization": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[8bits-1]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[8bits-0]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_cpu_offload[8bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_cpu_offload[8bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_cpu_offload[4bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[4bits-1]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[4bits-0]": + 
"float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant[8bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization[8bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_model_quantization[4bits]": + "float16/half is not supported on Gaudi.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant[4bits]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-full-dtype1]": + "float16/half is not supported on Gaudi.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe[4]": + "Xfail, due to SW-170326", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-True]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-False]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-False]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-True]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-False-True]": + "Xfail, due to SW-138014.", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]": + "Xfail, due to SW-175725.", + "unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[False]": + "Xfail, due to SW-175725.", + "unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[True]": + "Xfail, due to SW-175725.", + "unit/runtime/half_precision/test_bf16.py::TestZeroDtypeCocktail::test[default-fp16]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_wrapper.py::TestCustomMethod::test_custom_function": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype1]": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype1]": + "Fp16 not 
supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_set_compiler_fn": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_compile_kwargs": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_compile": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_compile_disabled": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_custom_backend": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_set_compile_kwargs": + "Fp16 not supported by Gaudi1", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype2]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype0]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype0]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype2]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype0]": + "Xfail, due to SW-175720.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype2]": + "Xfail, due to SW-175720.", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_use_reentrant[topo_config1]": + "Comm Init Rank Error.", + "unit/runtime/pipe/test_pipe.py::TestPipeCifar10::test_pipe_use_reentrant[topo_config2]": + "Comm Init Rank Error.", +} + +g2_xfail_tests = { + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-350m-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", +
"unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[cross-encoder/ms-marco-MiniLM-L-12-v2-text-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + 
"unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-350m-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/minilm-uncased-squad2-question-answering-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/minilm-uncased-squad2-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-cased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + 
"unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-350m-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/roberta-base-squad2-question-answering-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[cross-encoder/ms-marco-MiniLM-L-12-v2-text-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-uncased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[roberta-base-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[j-hartmann/emotion-english-distilroberta-base-text-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[dslim/bert-base-NER-token-classification-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[cross-encoder/ms-marco-MiniLM-L-12-v2-text-classification-fp16-noCG-noTriton]": + "Xfail, due to SW-163097.", + 
"unit/inference/test_inference.py::TestModelTask::test[roberta-large-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-cased-fill-mask-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-uncased-fill-mask-fp32-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-large-uncased-whole-word-masking-finetuned-squad-question-answering-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[bert-base-multilingual-cased-fill-mask-bf16-CG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[deepset/minilm-uncased-squad2-question-answering-bf16-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[Jean-Baptiste/roberta-large-ner-english-token-classification-fp32-noCG-noTriton]": + "Xfail, due to SW-163097.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/pythia-70m-deduped-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163095.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/pythia-70m-deduped-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163095.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/pythia-70m-deduped-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163095.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp16-noCG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/gpt-j-6b-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilgpt2-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp32-CG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp32-noCG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/gpt-j-6b-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-bf16-noCG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-bf16-CG-noTriton]": + 
"Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[EleutherAI/gpt-j-6b-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[gpt2-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilgpt2-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilbert-base-cased-distilled-squad-question-answering-fp16-CG-noTriton]": + "Xfail, failed on vanilla as well.", + "unit/inference/test_inference.py::TestModelTask::test[Norod78/hebrew-bad_wiki-gpt_neo-tiny-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_inference.py::TestModelTask::test[distilgpt2-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[False-False]": + "Xfail, due to SW-163097.", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[False-True]": + "Xfail, due to SW-163097.", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[True-False]": + "Xfail, due to SW-163097.", + "unit/inference/test_model_profiling.py::TestModelProfiling::test[True-True]": + "Xfail, due to SW-163097.", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-512-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-120-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-160-128-2-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-256-52-4-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-8192-128-64-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-1024-53-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-160-128-2-24-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-128-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-25-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-4096-128-64-3-True-True]": + "CUDA tests not supported by HPU", + 
"unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1536-128-24-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-True-True0]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-160-128-2-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-2048-128-32-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-2560-128-40-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1600-128-25-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[1-256-2048-32-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[64-1024-21-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-384-16-3-True-True1]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[3-1024-54-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-1024-511-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-2-3-True-True-0.05]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-1600-128-2-4-False-True-0.2]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-160-128-2-3-True-True-0.1]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-25-3-True-True-0.05]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-160-128-2-24-False-True-0.2]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-False-True]": + "CUDA tests not supported by HPU", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-True-True]": + "CUDA tests not supported by HPU", + "unit/inference/test_inference.py::TestMPSize::test[fp32-gpt-j]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[bf16-gpt-neo]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[fp16-gpt-neo]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[bf16-gpt-j]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[fp16-bloom]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[fp16-gpt-j]": + "Xfail, due to SW-162660.", + "unit/inference/test_inference.py::TestMPSize::test[fp32-gpt-neo]": + "Xfail, due to SW-162660.", + 
"unit/inference/test_inference.py::TestLowCpuMemUsage::test[gpt2]": + "Xfail, due to SW-164236.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-4-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-4-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-9-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-9-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-4-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-9-1024]": + "Xfail, due to SW-164239.", + "unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[True-2]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[True-3]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[True-1]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[True-1]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[True-2]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[True-3]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-2]": + "Xfail, due to SW-145262.", + "unit/runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-1]": + "Xfail, due to SW-145262.", + "unit/moe/test_moe.py::TestPRMoE::test[2-True]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestPRMoE::test[2-False]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[False-0-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[True-0-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[False-0-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe.py::TestMoE::test[True-0-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-2-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-True-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-1-2]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-True-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[True-False-1-4]": + "Xfail, due to SW-162657.", + "unit/moe/test_moe_tp.py::TestMOETensorParallel::test[False-False-1-2]": + "Xfail, due to SW-162657.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp16]": + "Xfail, due to SW-162575.", + 
"unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp16]": + "Xfail, due to SW-162575.", + "unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16]": + "Xfail, due to SW-162575.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant_cpu_offload": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization_cpu_offload": + "Xfail, due to SW-162660.", + "unit/moe/test_moe.py::TestMoE::test[True-1-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-2-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-1-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-2-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-1-4]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-1-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[True-2-2]": + "Xfail, due to SW-116160.", + "unit/moe/test_moe.py::TestMoE::test[False-2-2]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-True]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-False]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-False]": + "Xfail, due to SW-116160.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-True]": + "Xfail, due to SW-116160.", + "unit/runtime/zero/test_zero_context.py::TestSerialContext::test_subclass_param": + "Xfail, due to SW-156783.", + "unit/runtime/zero/test_zero_context_ancestry.py::TestSerialParamInit::test_subclass_param_init": + "Xfail, due to SW-143227.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-dtype1]": + "Xfail, due to SW-145262.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-dtype1]": + "Xfail, due to SW-145262.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-dtype1]": + "Xfail, due to SW-145262.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization_nvme_offload": + "Xfail, due to SW-164545.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant_nvme_offload": + "Xfail, due to SW-164545.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuSGD-SGD]": + "Xfail, due to SW-164551.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuSGD-SGD]": + "Xfail, due to SW-164551.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuAdam-Adam]": + "Xfail, due to 
SW-164551.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuAdam-Adam]": + "Xfail, due to SW-164551.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[True-MuAdamW-AdamW]": + "Xfail, due to SW-164551.", + "unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test[False-MuAdamW-AdamW]": + "Xfail, due to SW-164551.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[4]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[4]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[1]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[2]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[4]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[1]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_not_load_optimizer_state[2]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_optimizer_state[1]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_mics_optimizer.py::TestMiCSCheckpoint::test_load_module_only[2]": + "Xfail, due to SW-164577.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_pipeline_checkpoint_loading[3]": + "Xfail, due to SW-164593.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-dtype1]": + "Xfail, due to SW-164593.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_quantized_linear": + "Xfail, due to SW-164606.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[20-1024]": + "Xfail, due to SW-156782.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[20-2048]": + "Xfail, due to SW-156782.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[8-2048]": + "Xfail, due to SW-156782.", + "unit/runtime/zero/test_qwzero.py::TesthpZeroConfigSweep::test[8-1024]": + "Xfail, due to SW-156782.", + "unit/inference/test_inference.py::TestModelTaskKIFalse::test[EleutherAI/gpt-j-6b-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-163098.", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=1]": + "Xfail, due to SW-166162.", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=2]": + "Xfail, due to SW-166162.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-fp32-noCG-noTriton]": + "Xfail, due to SW-167459.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-bf16-noCG-noTriton]": + "Xfail, due to SW-167459.", + "unit/inference/test_inference.py::TestModelTask::test[facebook/opt-125m-text-generation-fp16-noCG-noTriton]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[EleutherAI/gpt-neo-1.3B-bsz=1]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[EleutherAI/gpt-neo-1.3B-bsz=2]": + "Xfail, due to SW-167459.", + 
"unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[facebook/opt-1.3b-bsz=2]": + "Xfail, due to SW-167459.", + "unit/hybrid_engine/test_he_all.py::TestHybridEngineTextGen::test_functionality[facebook/opt-1.3b-bsz=1]": + "Xfail, due to SW-167459.", + "unit/inference/test_stable_diffusion.py::TestStableDiffusion::test": + "Xfail, due to SW-170181.", + "unit/compression/test_dequantization.py::TestDequantization::test_dequantize": + "Xfail, due to SW-168442.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype1]": + "Xfail, due to SW-145262.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype1]": + "Xfail, due to SW-145262.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp16]": + "Xfail, due to SW-145262.", + "unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp16]": + "Xfail, due to SW-145262.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype1]": + "Xfail, due to SW-164593.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype1]": + "Xfail, due to SW-164593.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[4bits-0]": + "Xfail, due to SW-168583.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[8bits-1]": + "Xfail, due to SW-168583.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[4bits-1]": + "Xfail, due to SW-168583.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_quantized_linear[8bits-0]": + "Xfail, due to SW-168583.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant[4bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_cpu_offload[4bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization[8bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_cpu_offload[4bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_cpu_offload[8bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization[4bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_cpu_offload[8bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant[8bits]": + "Xfail, due to SW-162660.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_nvme_offload": + "Xfail, due to SW-164545.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_nvme_offload": + "Xfail, due to SW-164545.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe[4]": + "Xfail, due 
to SW-170326", + "unit/inference/test_checkpoint_sharding.py::TestCheckpointShard::test[facebook/opt-350m-fp16]": + "Xfail, due to SW-171096.", + "unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-False-True]": + "Xfail, due to SW-138014.", + "unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]": + "Xfail, due to SW-175725.", + "unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[False]": + "Xfail, due to SW-175725.", + "unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[True]": + "Xfail, due to SW-175725.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype1]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-2-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype1]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-1-dtype1]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-2-dtype1]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype2]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-1-dtype0]": + "Xfail, due to SW-175712.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype1]": + "Xfail, due to SW-175716.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype1]": + "Xfail, due to SW-175716.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype1]": + "Xfail, due to SW-175716.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype2]": + "Xfail, due to SW-175720.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[nvme-3-dtype0]": + "Xfail, due to SW-175720.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype2]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[cpu-3-dtype0]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype2]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero[none-3-dtype0]": + "Xfail, due to SW-175722.", + "unit/runtime/compile/test_load_config.py::TestConfigLoad::test_custom_backend": + "Xfail, due to SW-175723.", +} + +gpu_xfail_tests = { + "unit/moe/test_moe.py::TestMoE::test[False-2-2]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[False-1-2]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[True-2-2]": + "Xfail, due to SW-163554.", + 
"unit/moe/test_moe.py::TestMoE::test[True-1-4]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[True-2-4]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[True-1-2]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[False-1-4]": + "Xfail, due to SW-163554.", + "unit/moe/test_moe.py::TestMoE::test[False-2-4]": + "Xfail, due to SW-163554.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-True]": + "Xfail, due to SW-163554.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-True]": + "Xfail, due to SW-163554.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-False]": + "Xfail, due to SW-163554.", + "unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-False]": + "Xfail, due to SW-163554.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-4-1024]": + "Xfail, due to SW-163551.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[1-9-1024]": + "Xfail, due to SW-163551.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-4-1024]": + "Xfail, due to SW-163551.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-4-1024]": + "Xfail, due to SW-163551.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[4-9-1024]": + "Xfail, due to SW-163551.", + "unit/runtime/zero/test_zeropp.py::TestZeroPPConfigSweep::test[2-9-1024]": + "Xfail, due to SW-163551.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-1-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-1-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-255-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-1-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-255-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-1-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-255-1]": + "Xfail, due to SW-161262.", + 
"unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-4096-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-128-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-1-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-1-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-512-128-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-512-255-2]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype0-1232-255-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-4096-255-1]": + "Xfail, due to SW-161262.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-255-1]": + "Xfail, due to SW-161262.", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=2]": + "Test requires higher memory.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_post_init_quant_nvme_offload": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/quantization/test_int4_quantization.py::TestQuantizedInt4::test_zero3_int4_quantized_initialization_nvme_offload": + "Xfailed. failure observed on vanilla as well.", + "unit/ops/quantizer/test_fake_quantization.py::test_fake_quant_dequant[16-tensor_shape0]": + "Xfailed. failure observed on vanilla as well.", + "unit/ops/quantizer/test_fake_quantization.py::test_fake_quant_dequant[1-tensor_shape0]": + "Xfailed. failure observed on vanilla as well.", + "unit/ops/quantizer/test_fake_quantization.py::test_fake_quant_dequant[16-tensor_shape1]": + "Xfailed. failure observed on vanilla as well.", + "unit/ops/quantizer/test_fake_quantization.py::test_fake_quant_dequant[1-tensor_shape1]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/hybrid_engine/test_he_llama.py::TestHybridEngineLlama::test_functionality[huggyllama/llama-7b-bsz=1]": + "Test requires higher memory.", + "unit/ops/transformer/inference/test_vector_add.py::test_vector_add[dtype1-1232-1-1]": + "Xfail, due to SW-161262.", + "unit/inference/v2/kernels/ragged_ops/test_atom_builder.py::test_single_sequence[seq_params2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_atom_builder.py::test_single_sequence[seq_params0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_atom_builder.py::test_single_sequence[seq_params3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_atom_builder.py::test_single_sequence[seq_params1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_multiple_prompts[prompt_lengths3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_multiple_prompts[prompt_lengths1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_continuation[seq_params1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[2037]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_rotary_emb[False]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_gqa[head_config0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_rotary_emb[True]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[65]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[256]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_head_size[128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[33]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_single_prompt[2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_continuation[seq_params4]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_gqa[head_config2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_head_size[64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_multiple_prompts[prompt_lengths2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_fully_composed": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_gqa[head_config1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_multiple_prompts[prompt_lengths0]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_continuation[seq_params0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_continuation[seq_params3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_blocked_attn.py::test_continuation[seq_params2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_multiple_blocks[177-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_multiple_blocks[117-88]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_single_block[33-8]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_multiple_blocks[169-8]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_single_block[17-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_multiple_blocks[128-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_multi_sequence": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_single_block[1-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py::test_single_sequence_single_block[63-1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[False-169-8]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_multi_sequences[True]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[False-1-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[True-169-8]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[True-1-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[False-177-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_multi_sequences[False]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[True-33-15]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[True-17-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[False-33-15]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[False-128-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[True-117-88]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[False-17-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[False-1-63]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[True-128-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[False-117-88]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_single_block[True-1-63]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py::test_single_sequence_multiple_blocks[True-177-0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_supported_dtypes[dtype0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_problem_size_permutations[1024]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_multiple_sequences[seq_lens0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_problem_size_permutations[6144]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_multiple_sequences[seq_lens3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_supported_dtypes[dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_problem_size_permutations[6784]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_multiple_sequences[seq_lens2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_logits_gather.py::test_multiple_sequences[seq_lens1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[False-278-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[False-13-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[False-1977-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[True-278-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[True-13-64]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_gather.py::test_moe_gather[True-1977-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[True-13-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[False-13-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[True-1977-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[True-278-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[False-278-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py::test_moe_scatter[False-1977-64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_positional_embedding[seq_lens0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_problem_size_permutations[50304-6144]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_dtype_permutations[embed_dtype1-token_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_positional_embedding[seq_lens1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_complex_sequences[True-seq_lens1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_positional_embedding_offset": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_problem_size_permutations[32000-5120]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_complex_sequences[True-seq_lens0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_problem_size_permutations[1024-1024]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_positional_embedding[seq_lens3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_dtype_permutations[embed_dtype0-token_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_complex_sequences[False-seq_lens0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_dtype_permutations[embed_dtype0-token_dtype0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_positional_embedding[seq_lens2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_dtype_permutations[embed_dtype1-token_dtype0]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_ragged_embed.py::test_complex_sequences[False-seq_lens1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_single_mapping_gating[433-128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_score_accuracy[32-128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_negative_logits": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_score_accuracy[89-128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_single_mapping_gating[32-128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_single_mapping_gating[89-128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_single_mapping_gating[17-16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_single_mapping_gating[1-16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_score_accuracy[433-2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_score_accuracy[17-16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_determinism": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py::test_score_accuracy[1-16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape0-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape4-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape7-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape5-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape1-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape3-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape2-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape4-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape3-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape6-fp_dtype1]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape5-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape7-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape6-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape1-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear_t[problem_shape2-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/core_ops/test_blas_linear.py::test_blas_linear[problem_shape0-fp_dtype1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_multiple_prompts[prompt_lengths3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[256]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_gqa[head_config0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_continuation[seq_params2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_multiple_prompts[prompt_lengths1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[65]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_continuation[seq_params0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_head_size[128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_continuation[seq_params4]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_fully_composed": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_head_size[64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_continuation[seq_params1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_multiple_prompts[prompt_lengths0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_continuation[seq_params3]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_gqa[head_config2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[128]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[33]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_single_prompt[2037]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_multiple_prompts[prompt_lengths2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/ragged_ops/test_blocked_flash.py::test_gqa[head_config1]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_expert_variance[64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_in_out_channels[2048-8192]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_expert_variance[32]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_activation_types[ActivationType.RELU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_dtypes[dtype0]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_activation_types[ActivationType.GELU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_activation_types[ActivationType.SILU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_successive_inputs": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_in_out_channels[4096-2048]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_in_out_channels[6144-3072]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/modules/test_cutlass_moe.py::test_expert_variance[2]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_dtypes[DtypeEnum.bf16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_act_fns[ActivationType.GELU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_dtypes[DtypeEnum.fp16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_single_expert[13-2048-2048]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_act_fns[ActivationType.SILU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[64]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_single_expert[256-1024-4096]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[4]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_single_expert[893-5120-2560]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[16]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[128]": + "Xfailed. 
failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_act_fns[ActivationType.RELU]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_single_expert[278-5120-2048]": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/v2/kernels/cutlass_ops/test_moe_gemm.py::test_multi_expert[1]": + "Xfailed. failure observed on vanilla as well.", + "unit/runtime/half_precision/test_bf16.py::TestZeroDtypeCocktail::test[default-fp32]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero1]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero2]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero1]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero1]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero3]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero3]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero2]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero2]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero1]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero3]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero3]": + "Xfail, due to SW-169830.", + "unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_model_class[EltwiseMultiplicationTestNetwork_List]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_reduce_scatter[True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_prefetching[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_offload_optimizer[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_contiguous_gradients[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_model_class[EltwiseMultiplicationTestNetwork_namedtuple]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_model_class[EltwiseMultiplicationTestNetwork_NamedTuple]": + "Xfail, 
due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_param_persistence_threshold[10]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_zero_grad[True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_param_persistence_threshold[0]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_reduce_scatter[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_model_class[EltwiseMultiplicationTestNetwork_Dict]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_fp16_enabled[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_model_class[EltwiseMultiplicationTestNetwork_Tuple]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_zero_grad[False]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_contiguous_gradients[True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase::test_prefetching[True]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredInsideNestingInit::test_new_class_declared_inside_nesting_init": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredNestingInit::test_new_class_declared_nesting_init": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_nesting_init.py::TestNestingInit::test_nesting_init": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_nesting_init.py::TestShutdownInNestingInit::test_shutdown_in_nesting_init": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[2]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[1]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[3]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-2]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-1]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-2]": + "Xfail, due to SW-169830.", + "unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-1]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-full-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-full-dtype2]": + "Xfail, due to 
SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-full-dtype2]": + "Xfail, due to SW-169830.", + "unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-full-dtype2]": + "Xfail, due to SW-169830.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_post_init_quant_nvme_offload": + "Xfailed. failure observed on vanilla as well.", + "unit/inference/quantization/test_intX_quantization.py::TestQuantizedInt::test_zero3_int4_quantized_initialization_nvme_offload": + "Xfailed. failure observed on vanilla as well.", + "unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitAdamFP16Pipeline::test[topo_config0]": + "Xfailed. failure observed on vanilla as well.", + "unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambFP16Pipeline::test[topo_config0]": + "Xfailed. failure observed on vanilla as well.", + "unit/runtime/half_precision/onebit/test_onebit.py::TestZeroOneAdamFP16Pipeline::test[topo_config0]": + "Xfailed. failure observed on vanilla as well.", + "unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForward::test_forward[8-8192-128-64-3-False-True]": + "Test requires higher memory.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-9-1-1]": + "Xfail, due to SW-170526.", + 
"unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-18-2-3]": + "Xfail, due to SW-170526.", + 
"unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-9-2-1]": + "Xfail, due to SW-170526.", + 
"unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-1-2-3]": 
+ "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-9-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-1-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-9-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-18-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-1-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-9-2-1]": + "Xfail, due to SW-170526.", + 
"unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-1-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-18-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-18-2-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-18-1-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-1152-1-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-9-1-3]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-9-2-1]": + "Xfail, due to SW-170526.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-9-1-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-18-2-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-18-1-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-9-2-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-576-18-2-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-18-2-1]": + "Xfail, due to SW-170527.", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-9-1-1]": + "Xfail, due to SW-170527.", + 
"unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-576-9-1-1]": + "Xfail, due to SW-170527.", + "unit/checkpoint/test_shared_weights.py::TestCheckpointSharedWeights::test_checkpoint_shared_weights": + "Xfail, due to SW-169830.", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-1-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-9-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-18-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-18-1]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-1-1]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-9-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-1-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-9-1]": 
+ "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-1152-18-1]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-9-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-9-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-576-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-6-1152-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-2304-1-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-1152-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-9-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-2304-18-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-False-12-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-6-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-24-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-1152-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-6-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-6-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-24-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-False-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-12-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-24-2304-1-1]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-False-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-24-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-12-576-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-True-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-12-1152-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-1152-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-24-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-24-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-6-1152-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-24-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-576-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-False-24-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-2-True-6-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-False-6-576-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-False-6-2304-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-1-False-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-False-12-1152-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-False-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-True-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-6-True-12-576-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-6-True-12-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-1-True-6-2304-1-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-2-True-12-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-6-True-6-1152-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-12-576-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-6-2304-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-24-2304-9-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype1-1-True-6-1152-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-True-6-2304-18-1]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype1-2-False-12-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[False-dtype0-2-False-24-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-12-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-True-24-2304-9-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-6-576-18-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-6-False-24-1152-18-2]": + "Xfailed due to SW-174070", + 
"unit/ops/transformer/inference/test_softmax_context.py::test_softmax_context[True-dtype0-1-True-12-576-1-2]": + "Xfailed due to SW-174070", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-1-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-1152-18-1]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-576-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-24-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-1-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-576-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-24-2304-1-2]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-2304-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-12-2304-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-1-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-2304-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-1-1152-9-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-24-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-1152-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-1-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-12-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-True-12-1152-18-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype0-True-24-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-9-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-12-1152-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[False-dtype1-True-12-2304-18-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-False-1-576-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-2304-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype1-True-24-576-1-2]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-24-576-9-1]": + "Xfailed due to SW-173499", + 
"unit/ops/transformer/inference/test_softmax_1.py::test_softmax[True-dtype0-False-12-1152-1-1]": + "Xfailed due to SW-173499", + "unit/ops/transformer/inference/test_qkv_gemm.py::test_qkv_gemm[dtype0-True]": + "Xfailed due to SW-174399", + "unit/ops/transformer/inference/test_qkv_gemm.py::test_qkv_gemm[dtype0-False]": + "Xfailed due to SW-174399", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-9-1-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-1-2-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-9-2-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-2304-1-1-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-2304-1-1-1]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-1-576-9-1-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-1152-1-1-1]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-2304-1-1-1]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype0-12-1152-18-1-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-1152-18-2-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-1-2304-1-1-1]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-1-1-3]": + "Flaky tests", + "unit/ops/transformer/inference/test_bias_add_transform_0213.py::test_bias_add_transform_0213[dtype1-12-576-9-2-3]": + "Flaky tests", +}