From d7a541ea2d67543660bb1c14255afb6c1b999b08 Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Wed, 17 Apr 2024 18:47:30 +0530 Subject: [PATCH 001/287] Fix check-fail in GatherV2 The op GatherV2 leads to a check failure specifically when axis=kint64max; any other out-of-range axis value raises a valid exception. --- tensorflow/core/kernels/gather_op.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 1ff8145688d35e..098cc6866a1d13 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -77,6 +77,9 @@ class GatherOp : public OpKernel { errors::InvalidArgument("axis must be int32 or int64.")); } } + // Special case to avoid a check failure when axis = kint64max. + OP_REQUIRES(c, axis < kint64max, + absl::InvalidArgumentError("axis must be less than kint64max")); int64_t min_params_dim = axis < 0 ? -axis : axis + 1; OP_REQUIRES( From 24a9d7b038fa4e87c8d5960ebadd204356243ece Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 May 2024 14:13:20 +0000 Subject: [PATCH 002/287] Merged commit includes the following changes: 637889039 by A. Unique TensorFlower: Remove experimental_adaptive_avx_optimization flag from XNNPACK delegate options It's always on now. -- 637886275 by A. Unique TensorFlower: [XLA:GPU][IndexAnalysis] Use a flag for IsKnownEmpty instead of recomputing every time. Right now, we would try to simplify or compose with indexing maps that have a known empty domain. That's incorrect, but checking whether the domain is empty every time is expensive, so the result can be cached. -- 637876088 by A. Unique TensorFlower: Internal config change -- 637864812 by A. Unique TensorFlower: PR #13088: [ROCm] Fix reduce_atomic_min.hlo.test Imported from GitHub PR https://github.com/openxla/xla/pull/13088 Copybara import of the project: -- b241e076198c03fffd8c7e3a6568070ef0223653 by mmakevic : Fix reduce_atomic_min.hlo.test -- f894f1954513019f0ca6890a27e09e0fee9d462e by mmakevic : Remove extra space Merging this change closes #13088 -- 637860531 by A. Unique TensorFlower: Remove xla_gpu_normalize_layouts flag. By now, this is really not experimental anymore. -- 637857834 by A. Unique TensorFlower: Add heuristic for when to treat Gather ops as coalesced. -- 637820064 by A. Unique TensorFlower: compat: Update forward compatibility horizon to 2024-05-28 -- 637820063 by A. Unique TensorFlower: Update GraphDef version to 1876. -- 637756070 by A. Unique TensorFlower: Automated rollback of changelist 636206934. 637674999 by A. Unique TensorFlower: [xla:cpu] Add initial support for Thunk-based execution to CpuCompiler and CpuExecutable Add support for compiling XLA:CPU HloModule to a ThunkSequence instead of an LLVM module and a jit-compiled function. -- 637666734 by A. Unique TensorFlower: Don't fuse inside computations that are already fused. -- 637657345 by A. Unique TensorFlower: Automated rollback of changelist 636208997. 637651034 by A. Unique TensorFlower: Integrate LLVM at llvm/llvm-project@fddf350f9640 Updates LLVM usage to match [fddf350f9640](https://github.com/llvm/llvm-project/commit/fddf350f9640) -- 637639233 by A. Unique TensorFlower: PR #12940: [ROCm] Fix dot_bf16.hlo.test on ROCm Imported from GitHub PR https://github.com/openxla/xla/pull/12940 Added additional params for `hlo_lit_tests` as a workaround, so `mi200.txtpb` would be used in `dot_bf16.hlo.test` for rocm.
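A note on the GatherV2 guard in patch 001 above: the check failure stems from the `axis + 1` computation visible in the diff context. The following minimal, self-contained C++ sketch shows the mechanism; it is not TensorFlow code, and `ValidateGatherAxis` is a hypothetical stand-in for the validation sequence in gather_op.cc.

#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical mirror of the axis validation in gather_op.cc. Without the
// kint64max guard, `axis + 1` overflows for axis == kint64max (signed
// overflow; it wraps to a negative value on common platforms), so
// min_params_dim goes negative, the rank check passes vacuously, and the
// kernel check-fails later instead of returning InvalidArgument.
bool ValidateGatherAxis(int64_t axis, int64_t params_rank) {
  const int64_t kint64max = std::numeric_limits<int64_t>::max();
  if (axis >= kint64max) return false;  // the guard added by the patch
  const int64_t min_params_dim = axis < 0 ? -axis : axis + 1;
  return params_rank >= min_params_dim;
}

int main() {
  // Rejected cleanly rather than check-failing:
  std::cout << ValidateGatherAxis(std::numeric_limits<int64_t>::max(), 2)
            << "\n";  // prints 0
  // A normal axis still validates:
  std::cout << ValidateGatherAxis(1, 2) << "\n";  // prints 1
  return 0;
}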
Copybara import of the project: -- c3bb3a7349266a51ff22a2e18dab0afb6e81bad4 by mmakevic : Have dot_bf16.hlo.test use mi200.txtpb for rocm Merging this change closes #12940 -- 637632492 by A. Unique TensorFlower: PR #13089: Fix reduce_large_row_to_scalar.hlo.test Imported from GitHub PR https://github.com/openxla/xla/pull/13089 Copybara import of the project: -- ae97058c01ca57107a2566a6f190d51f5ad4ca0e by mmakevic : Fix reduce_large_row_to_scalar.hlo.test Merging this change closes #13089 -- 637623329 by A. Unique TensorFlower: Automated rollback of changelist 637594837. 637607386 by A. Unique TensorFlower: Automated rollback of changelist 636926669. 637594837 by A. Unique TensorFlower: [XLA:GPU] Pass CUDA_VERSION explicitly into CudnnFusedConvRewriter. Passing the CuDNN version will be the next step. -- 637580666 by A. Unique TensorFlower: Remove usage of --xla_gpu_enable_triton_hopper in autotuner -- 637578573 by A. Unique TensorFlower: [XLA:GPU] Add documentation about RTVars. -- 637570959 by A. Unique TensorFlower: Update GraphDef version to 1875. -- 637570942 by A. Unique TensorFlower: compat: Update forward compatibility horizon to 2024-05-27 -- 637561798 by A. Unique TensorFlower: PR #12979: [NVIDIA] Fix PGLE for latency estimation of p2p instructions Imported from GitHub PR https://github.com/openxla/xla/pull/12979 PGLE doesn't recognize p2p instructions such as send or recv as async operations. This adds a utility to check whether an instruction is a p2p communication instruction. Copybara import of the project: -- 469b2d31ff6b0270dda28f8754462681514d0e04 by TJ Xu : fix pgle not recognizing p2p instructions Merging this change closes #12979 -- 637560035 by A. Unique TensorFlower: [xla:gpu] Track loop iteration counter of a WhileThunk in a thread-local variable -- 637552495 by A. Unique TensorFlower: PR #13056: Use `operator->` with XLA FFI Result Buffers in custom call docs Imported from GitHub PR https://github.com/openxla/xla/pull/13056 Copybara import of the project: -- 7940a1a02a0f93736a88406958edf62488bdbe19 by Andrey Portnoy : Use `operator->` with XLA FFI Result Buffers in custom call docs Merging this change closes #13056 -- 637547404 by A. Unique TensorFlower: PR #13068: Introduce the Blackwell compute capability. Imported from GitHub PR https://github.com/openxla/xla/pull/13068 Introduce the Blackwell compute capability. Future Blackwell-specific changes can be guarded by this capability. Copybara import of the project: -- cc1adebc95166b2d3979cc01de954a1895515ad4 by Dimitris Vardoulakis : Introduce the Blackwell compute capability. Future Blackwell-specific changes can be guarded by this capability. Merging this change closes #13068 -- 637541058 by A. Unique TensorFlower: PR #13061: Add Triton support for XLA clamp Imported from GitHub PR https://github.com/openxla/xla/pull/13061 Add Triton support for XLA clamp instruction. Clamp is a common instruction found in FP8 fusions, and will be used in cuDNN fusions. This is a fix for a previously rolled-back PR due to an internal ir_emitter_triton test failure: https://github.com/openxla/xla/commit/d114eceb0afa4289e1ba4468a0474d2c1ffe4123 cc @sergeykozub @sergachev Copybara import of the project: -- 3496ba2fa86571ab290e0881dd06400c415d80b6 by Elfie Guo : Add Triton support for XLA clamp. Merging this change closes #13061 -- 637366630 by A. Unique TensorFlower: Update GraphDef version to 1874. -- 637366295 by A. Unique TensorFlower: compat: Update forward compatibility horizon to 2024-05-26 -- 637185396 by A.
Unique TensorFlower: Automated Code Change -- 637168744 by A. Unique TensorFlower: Update GraphDef version to 1873. -- 637168421 by A. Unique TensorFlower: compat: Update forward compatibility horizon to 2024-05-25 -- 637166714 by A. Unique TensorFlower: Attempt loading libOpenCL.so before libOpenCL-pixel.so -- 637137789 by A. Unique TensorFlower: feat: Implement hermetic Python version matching system Python version -- 637102058 by A. Unique TensorFlower: [IFRT] Add xla::ifrt::Sharding::IsFullyReplicated() IFRT Sharding type gains `IsFullyReplicated()`, which quickly tells if the sharding represents a fully-replicated sharding. The main motivation is to make full replication information queryable at IFRT shardings and to prepare for enabling IFRT implementations to handle full replication directly. There is a preset of rules: * `SingleDeviceSharding` is trivially fully replicated by its definition. * `ConcreteSharding` and `OpaqueSharding` are not fully replicated. There are special cases where they may be fully replicated, but the user is advised to use a more specific sharding type to represent such cases. * `ConcreteEvenSharding` may or may not be fully replicated. This is controlled at creation time. * `ShardingParamSharding` and (IFRT) `HloSharding` depend on whether their lower-level sharding represents full replication. `ConcreteEvenSharding` is a noteworthy case where the full replication information does not come from the existing source of the information. This is because the creators of this sharding (e.g., JAX) typically have the information, but the replication information is lost when coercing it into `ConcreteEvenSharding`. This will gradually become less problematic once JAX uses a higher-level IFRT sharding type (mainly (IFRT) `HloSharding`) in more places. This change extends the `Sharding` type, but the new method is not used by any existing code. -- 637097325 by A. Unique TensorFlower: Ensure delegates properly delegate models -- 637080761 by A. Unique TensorFlower: Add barrier logs. -- 637070664 by A. Unique TensorFlower: Clean up include and build file -- 637069670 by A. Unique TensorFlower: Use the `LoadedClientGraph`'s copy of `FunctionLibraryDefinition` instead of getting it from the `FallbackState` in the parent `GraphExecutor` -- 637069442 by A. Unique TensorFlower: update doc ref -- 637061122 by A. Unique TensorFlower: Refactor exhaustive testing of unary float32 functions into a library. -- 637046941 by A. Unique TensorFlower: fix profile_util's compatible_with tag typo -- 637028365 by A. Unique TensorFlower: [XLA] Refactor HostOffloader. Change HostOffloader's algorithm for identifying host memory offloading. This approach supports every conceivable host memory offloading pattern (as of today). -- 637023690 by A. Unique TensorFlower: Simplify volumes for docker container in XLA build script -- 637018892 by A. Unique TensorFlower: move flatbuffer_compatibility_test target to tflite compiler -- 637008187 by A. Unique TensorFlower: Add copyright notice to profiler_utils.cc -- 636990162 by A. Unique TensorFlower: Adds a proto profile summary formatter to the TFLite benchmark. Adds a Python script to convert benchmark profile protos to a JSON consumable by the model-explorer. -- 636976463 by A.
Unique TensorFlower: Add profiler_util to enable flexible TPU profiler registration for different purposes -- PiperOrigin-RevId: 637889039 --- tensorflow/compiler/mlir/lite/schema/BUILD | 23 + .../schema/flatbuffer_compatibility_test.cc | 5 +- .../compiler/mlir/lite/schema/schema_v3b.fbs | 1242 +++++++++++ tensorflow/core/kernels/BUILD | 1 - tensorflow/core/kernels/gather_nd_op.cc | 3 +- tensorflow/core/kernels/gather_nd_op.h | 7 +- tensorflow/core/kernels/scatter_nd_op.cc | 63 +- tensorflow/core/kernels/scatter_nd_op.h | 4 +- .../core/kernels/scatter_nd_op_cpu_impl.h | 52 +- .../core/kernels/scatter_nd_op_gpu.cu.cc | 10 +- tensorflow/core/ops/uniform_quant_ops.cc | 3 +- tensorflow/core/public/version.h | 2 +- tensorflow/core/tfrt/common/BUILD | 48 +- .../core/tfrt/common/async_value_tensor.cc | 5 + .../core/tfrt/common/async_value_tensor.h | 3 + .../tfrt/common/create_pjrt_client_util.cc | 6 + .../tfrt/common/create_pjrt_client_util.h | 2 +- .../common/create_pjrt_client_util_test.cc | 3 +- tensorflow/core/tfrt/common/global_state.cc | 3 +- .../tfrt/common/pjrt_client_factory_options.h | 2 - .../common/pjrt_client_factory_registry.cc | 11 +- .../common/pjrt_client_factory_registry.h | 3 +- .../common/pjrt_cpu_client_registration.cc | 6 +- .../pjrt_cpu_client_registration_test.cc | 4 +- .../common/pjrt_gpu_client_registration.cc | 3 +- .../pjrt_gpu_client_registration_test.cc | 4 +- tensorflow/core/tfrt/common/pjrt_state.cc | 6 + tensorflow/core/tfrt/common/pjrt_state.h | 8 + .../core/tfrt/common/pjrt_state_test.cc | 11 +- tensorflow/core/tfrt/common/pjrt_util.cc | 6 +- tensorflow/core/tfrt/common/pjrt_util.h | 3 +- tensorflow/core/tfrt/common/pjrt_util_test.cc | 4 +- .../tfrt/graph_executor/graph_executor.cc | 3 +- tensorflow/lite/CMakeLists.txt | 7 + .../lite/delegates/gpu/cl/opencl_wrapper.cc | 34 +- .../utils/experimental/stable_delegate/BUILD | 1 + .../stable_delegate/kernel_test_main.cc | 15 +- .../lite/delegates/xnnpack/conv_2d_test.cc | 35 - .../delegates/xnnpack/xnnpack_delegate.cc | 7 - .../lite/delegates/xnnpack/xnnpack_delegate.h | 2 - tensorflow/lite/kernels/test_util.cc | 7 +- tensorflow/lite/kernels/test_util.h | 9 + tensorflow/lite/profiling/BUILD | 3 + .../lite/profiling/profile_summarizer.cc | 2 + .../lite/profiling/profile_summarizer.h | 19 +- .../profiling/profile_summary_formatter.cc | 225 +- .../profiling/profile_summary_formatter.h | 81 +- .../profile_summary_formatter_test.cc | 260 ++- tensorflow/lite/profiling/proto/BUILD | 41 + .../lite/profiling/proto/CMakeLists.txt | 41 + .../lite/profiling/proto/profiling_info.proto | 63 + tensorflow/lite/python/BUILD | 1 + tensorflow/lite/schema/BUILD | 22 - tensorflow/lite/tools/BUILD | 1 + .../lite/tools/benchmark/CMakeLists.txt | 9 + tensorflow/lite/tools/benchmark/README.md | 17 + .../tools/benchmark/benchmark_tflite_model.cc | 74 +- .../tools/benchmark/profiling_listener.cc | 32 +- .../lite/tools/benchmark/profiling_listener.h | 8 +- .../tools/cmake/modules/FindProtobuf.cmake | 16 + .../lite/tools/cmake/modules/protobuf.cmake | 45 + tensorflow/python/compat/compat.py | 2 +- third_party/llvm/generated.patch | 1901 +++++++++++++++-- third_party/llvm/workspace.bzl | 4 +- third_party/py/python_init_repositories.bzl | 4 +- third_party/py/python_repo.bzl | 148 +- third_party/xla/.kokoro/linux/build.sh | 14 +- third_party/xla/docs/custom_call.md | 2 +- third_party/xla/docs/indexing.md | 238 ++- .../py/python_init_repositories.bzl | 4 +- .../xla/third_party/py/python_repo.bzl | 148 +- .../py/python_init_repositories.bzl | 4 +-
.../tsl/third_party/py/python_repo.bzl | 148 +- third_party/xla/xla/debug_options_flags.cc | 13 +- third_party/xla/xla/pjrt/cpu/BUILD | 4 + third_party/xla/xla/pjrt/cpu/cpu_client.cc | 87 +- third_party/xla/xla/python/BUILD | 20 +- third_party/xla/xla/python/ifrt/sharding.cc | 43 +- third_party/xla/xla/python/ifrt/sharding.h | 32 +- .../xla/xla/python/ifrt/sharding_serdes.cc | 11 +- .../xla/xla/python/ifrt/sharding_serdes.proto | 1 + .../xla/python/ifrt/sharding_serdes_test.cc | 9 +- .../xla/xla/python/ifrt/sharding_test.cc | 89 +- third_party/xla/xla/python/pjrt_ifrt/BUILD | 7 + .../xla/xla/python/pjrt_ifrt/xla_sharding.cc | 18 + .../xla/xla/python/pjrt_ifrt/xla_sharding.h | 11 +- .../xla/python/pjrt_ifrt/xla_sharding_test.cc | 35 + third_party/xla/xla/python/profiler.cc | 24 +- third_party/xla/xla/python/profiler_utils.cc | 56 + third_party/xla/xla/python/profiler_utils.h | 27 + third_party/xla/xla/service/BUILD | 3 + third_party/xla/xla/service/cpu/BUILD | 45 + .../xla/xla/service/cpu/cpu_compiler.cc | 42 +- .../xla/xla/service/cpu/cpu_executable.cc | 51 +- .../xla/xla/service/cpu/cpu_executable.h | 55 +- third_party/xla/xla/service/cpu/runtime/BUILD | 4 + .../xla/xla/service/cpu/runtime/copy_thunk.cc | 9 +- .../xla/xla/service/cpu/runtime/thunk.cc | 25 + .../xla/xla/service/cpu/runtime/thunk.h | 28 +- .../xla/xla/service/cpu/thunk_emitter.cc | 95 + .../xla/xla/service/cpu/thunk_emitter.h | 59 + .../gpu/conv_layout_normalization_test.cc | 32 +- .../xla/service/gpu/cudnn_fusion_compiler.cc | 122 +- .../xla/xla/service/gpu/fusions/cudnn_test.cc | 26 + .../xla/service/gpu/gemm_fusion_autotuner.cc | 5 +- .../xla/xla/service/gpu/gpu_compiler.cc | 56 +- .../xla/xla/service/gpu/instruction_fusion.cc | 27 +- .../xla/xla/service/gpu/instruction_fusion.h | 13 +- .../service/gpu/instruction_fusion_test.cc | 22 + .../xla/xla/service/gpu/ir_emitter_triton.cc | 4 + .../xla/service/gpu/ir_emitter_triton_test.cc | 14 +- third_party/xla/xla/service/gpu/model/BUILD | 2 + .../service/gpu/model/coalescing_analysis.cc | 65 +- .../service/gpu/model/coalescing_analysis.h | 1 + .../gpu/model/coalescing_analysis_test.cc | 58 + .../model/gpu_indexing_performance_model.cc | 8 +- .../gpu/model/indexing_analysis_test.cc | 44 +- .../xla/xla/service/gpu/model/indexing_map.cc | 118 +- .../xla/xla/service/gpu/model/indexing_map.h | 50 +- .../service/gpu/model/indexing_map_test.cc | 34 +- .../service/gpu/model/indexing_test_utils.cc | 2 +- third_party/xla/xla/service/gpu/runtime/BUILD | 2 + .../xla/service/gpu/runtime/while_thunk.cc | 29 +- .../xla/xla/service/gpu/runtime/while_thunk.h | 7 + third_party/xla/xla/service/gpu/tests/BUILD | 6 + .../xla/xla/service/gpu/tests/dot_bf16.hlo | 4 +- .../service/gpu/tests/reduce_atomic_min.hlo | 415 ++-- .../gpu/tests/reduce_large_row_to_scalar.hlo | 510 +++-- .../xla/xla/service/gpu/triton_support.cc | 2 +- third_party/xla/xla/service/host_offloader.cc | 1587 +++++++------- third_party/xla/xla/service/host_offloader.h | 170 +- .../xla/xla/service/host_offloader_test.cc | 1149 ++++++++-- .../xla/service/latency_hiding_scheduler.cc | 8 + .../xla/service/latency_hiding_scheduler.h | 1 + .../profile_guided_latency_estimator.cc | 3 +- .../profile_guided_latency_estimator_test.cc | 56 + .../xla/stream_executor/device_description.h | 3 +- third_party/xla/xla/tests/exhaustive/BUILD | 32 +- .../tests/exhaustive/exhaustive_test_main.cc | 33 + .../exhaustive_unary_test_f32_or_smaller.cc | 46 +- .../coordination/coordination_service.cc | 8 +- third_party/xla/xla/xla.proto | 9 +- 
third_party/xla/xla/xla_data.proto | 4 +- 143 files changed, 8669 insertions(+), 2274 deletions(-) rename tensorflow/{ => compiler/mlir}/lite/schema/flatbuffer_compatibility_test.cc (95%) create mode 100644 tensorflow/compiler/mlir/lite/schema/schema_v3b.fbs create mode 100644 tensorflow/lite/profiling/proto/BUILD create mode 100644 tensorflow/lite/profiling/proto/CMakeLists.txt create mode 100644 tensorflow/lite/profiling/proto/profiling_info.proto create mode 100644 tensorflow/lite/tools/cmake/modules/FindProtobuf.cmake create mode 100644 tensorflow/lite/tools/cmake/modules/protobuf.cmake create mode 100644 third_party/xla/xla/python/profiler_utils.cc create mode 100644 third_party/xla/xla/python/profiler_utils.h create mode 100644 third_party/xla/xla/service/cpu/thunk_emitter.cc create mode 100644 third_party/xla/xla/service/cpu/thunk_emitter.h create mode 100644 third_party/xla/xla/tests/exhaustive/exhaustive_test_main.cc diff --git a/tensorflow/compiler/mlir/lite/schema/BUILD b/tensorflow/compiler/mlir/lite/schema/BUILD index 17a6bdb636959d..7cbc2253a83821 100644 --- a/tensorflow/compiler/mlir/lite/schema/BUILD +++ b/tensorflow/compiler/mlir/lite/schema/BUILD @@ -1,4 +1,5 @@ load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( @@ -69,3 +70,25 @@ cc_library( "@flatbuffers", ], ) + +# Schema test to make sure we don't introduce backward incompatible changes +# to schemas. +tf_cc_test( + name = "flatbuffer_compatibility_test", + size = "small", + srcs = ["flatbuffer_compatibility_test.cc"], + data = [ + "schema.fbs", + "schema_v3b.fbs", + ], + tags = [ + "no_oss", + "tflite_not_portable_android", + "tflite_not_portable_ios", + ], + deps = [ + "//tensorflow/core/platform", + "@com_google_googletest//:gtest_main", + "@flatbuffers//:flatc_library", + ], +) diff --git a/tensorflow/lite/schema/flatbuffer_compatibility_test.cc b/tensorflow/compiler/mlir/lite/schema/flatbuffer_compatibility_test.cc similarity index 95% rename from tensorflow/lite/schema/flatbuffer_compatibility_test.cc rename to tensorflow/compiler/mlir/lite/schema/flatbuffer_compatibility_test.cc index 976c2b302c1a6e..c2eea199bc6401 100644 --- a/tensorflow/lite/schema/flatbuffer_compatibility_test.cc +++ b/tensorflow/compiler/mlir/lite/schema/flatbuffer_compatibility_test.cc @@ -63,9 +63,10 @@ TEST(SchemaTest, TestCompatibility) { // Read file contents of schemas into strings // TODO(aselle): Need a reliable way to load files. std::string base_contents, current_contents; - const char *base_filename = TFLITE_TF_PREFIX "lite/schema/schema_v3b.fbs"; + const char *base_filename = TFLITE_TF_PREFIX + "compiler/mlir/lite/schema/schema_v3b.fbs"; const char *current_filename = - TFLITE_TF_PREFIX "lite/schema/schema.fbs"; + TFLITE_TF_PREFIX "compiler/mlir/lite/schema/schema.fbs"; ASSERT_TRUE(LoadFileRaw(base_filename, &base_contents)); ASSERT_TRUE(LoadFileRaw(current_filename, ¤t_contents)); diff --git a/tensorflow/compiler/mlir/lite/schema/schema_v3b.fbs b/tensorflow/compiler/mlir/lite/schema/schema_v3b.fbs new file mode 100644 index 00000000000000..917786050f7e8b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/schema/schema_v3b.fbs @@ -0,0 +1,1242 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Revision History +// Version 0: Initial version. +// Version 1: Add subgraphs to schema. +// Version 2: Rename operators to conform to NN API. +// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers. +// Version 3a: Add new builtin op code field. Has backward compatibility with +// version 3. +// Version 3b: Rename fields in SignatureDef. Has backward compatibility with +// version 3 and 3a. + +namespace tflite; + +// This corresponds to the version. +file_identifier "TFL3"; +// File extension of any written files. +file_extension "tflite"; + +// IMPORTANT: All new members of tables, enums and unions must be added at the +// end to ensure backwards compatibility. + +// The type of data stored in a tensor. +enum TensorType : byte { + FLOAT32 = 0, + FLOAT16 = 1, + INT32 = 2, + UINT8 = 3, + INT64 = 4, + STRING = 5, + BOOL = 6, + INT16 = 7, + COMPLEX64 = 8, + INT8 = 9, + FLOAT64 = 10, + COMPLEX128 = 11, + UINT64 = 12, + // Experimental: Resource and variant types are experimental, that are subject + // to change. Do not implement custom kernels using resource & variant types + // now. + RESOURCE = 13, + VARIANT = 14, + UINT32 = 15, +} + +// Custom quantization parameters for experimenting with new quantization +// techniques. +table CustomQuantization { + custom:[ubyte] (force_align: 16); +} + +// Represents a specific quantization technique's parameters. +union QuantizationDetails { + CustomQuantization, +} + +// Parameters for converting a quantized tensor back to float. +table QuantizationParameters { + // These four parameters are the asymmetric linear quantization parameters. + // Given a quantized value q, the corresponding float value f should be: + // f = scale * (q - zero_point) + // For other quantization types, the QuantizationDetails below is used. + min:[float]; // For importing back into tensorflow. + max:[float]; // For importing back into tensorflow. + scale:[float]; // For dequantizing the tensor's values. + zero_point:[long]; + + // If this is not none, the other quantization parameters (i.e. min, max, + // scale, zero_point fields above) are ignored and the value of the + // QuantizationDetails union should be used. + details:QuantizationDetails; + + // Specifies the dimension of the Tensor's shape that the scales and + // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1] + // with quantization params: + // scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1 + // will be quantized across the second dimension of t. + // t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1 + // t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2 + // t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3 + quantized_dimension:int; +} + +// Sparse tensors. +// We use a modification of the TACO format. +// Reference: http://tensor-compiler.org/kjolstad-oopsla17-tensor-compiler.pdf +// +// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), +// potentially with a k-dimensional block (0 <= k <= n) with dims +// (dn, ..., dn+k-1), the format needs to specify: +// 1. 
In what order to traverse these dimensions. For example, to store a 2-D +// matrix in row major order, the traversal order would be (d0, d1), +// whereas to store it in column major order, the traversal order would be +// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order +// could be (d0, d1, d2, d3). +// 2. How each block dimension in (dn, ..., dn+k-1) maps to the original +// tensor dimension in (d0, ..., dn-1). +// 3. In the traversal order defined above, the format (dense vs. sparse) and +// index metadata for each dimension. For a dense dimension, this is just +// the size of that dimension. For a sparse dimension, it's the same as +// the compressed index defined in the Compressed Sparse Row (CSR) format. +// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) + +// The storage type for a dimension. Currently we support: +// 1. DENSE: each coordinate in this dimension is stored implicitly. +// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The +// compression technique is the same what CSR uses. +// More types like a sparse dimension with a different compression technique +// could be added to the list in the future. +enum DimensionType : byte { + DENSE = 0, + SPARSE_CSR = 1, +} + +table Int32Vector { + values:[int]; +} + +table Uint16Vector { + values:[ushort] (force_align: 4); +} + +table Uint8Vector { + values:[ubyte] (force_align: 4); +} + +// Variable-typed buffer to store the index metadata for a sparse dimension. +// The widest type is Int32 instead of UInt32 because tensor's shape is a int32 +// vector. We don't want the per-dimensional index to overflow that range. +union SparseIndexVector { + Int32Vector, + Uint16Vector, + Uint8Vector +} + +table DimensionMetadata { + // Whether a dimension is dense or sparse. + format:DimensionType; + // Index metadata used for a dimension. + // - If format is DimensionType.DENSE then we use the dense_size field to + // store the size of that dimension. Each index in that dimension is + // stored implicitly. + // - If format is DimensionType.SPARSE_CSR then we use array_segments and + // array_indices to encode that dimension. array_segments represents how + // to segment the indices array, each segment corresponds to one element + // in the previous dimension. array_indices represents the index of the + // non-zero elements within this dimension (as those in the CSR matrix + // format, where the first array is row pointers and the second array is + // column indices). + dense_size:int; + array_segments:SparseIndexVector; + array_indices:SparseIndexVector; +} + +// Parameters to encode a sparse TfLite tensor. +table SparsityParameters { + // The traversal order of the dimensions defined in the `shape` field of the + // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1, + // ..., dn-1), + // - if not block sparse, the traversal_order is just a permutation of (d0, + // ..., dn-1). For example, a 2-D matrix stored in row-major order would + // have traversal_order = (d0, d1). + // - if block sparse with a k-dimensional block (0 <= k <= n), the + // traversal_order has n + k elements. The first n elements are still a + // permutation of (d0, ..., dn-1). The lask k elements are a permutation + // of (dn, ..., dn+k-1), defining how to traverse a block internally. For + // example, a 2-D matrix with 2-D blocks, both stored in row-major order + // would have traversal_order = (d0, d1, d2, d3). 
+ traversal_order:[int]; + // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + // stores how a block dimension in (dn, ..., dn+k-1) maps to the original + // tensor dimension in (d0, ..., dn). + // It's stored in the order of (dn, ..., dn+k-1). + // If not block-sparse, this field is NULL. + block_map:[int]; + // In the traversal order defined above, the metadata needed for + // each dimension to locate the non-zero values in the original dense tensor. + // The size of the dim_metadata array = the size of the traversal_order array + // = n + k. + dim_metadata:[DimensionMetadata]; +} + +table Tensor { + // The tensor shape. The meaning of each entry is operator-specific but + // builtin ops use: [batch size, height, width, number of channels] (That's + // Tensorflow's NHWC). + shape:[int]; + type:TensorType; + // An index that refers to the buffers table at the root of the model. Or, + // if there is no data buffer associated (i.e. intermediate results), then + // this is 0 (which refers to an always existent empty buffer). + // + // The data_buffer itself is an opaque container, with the assumption that the + // target device is little-endian. In addition, all builtin operators assume + // the memory is ordered such that if `shape` is [4, 3, 2], then index + // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k]. + buffer:uint; + name:string; // For debugging and importing back into tensorflow. + quantization:QuantizationParameters; // Optional. + + is_variable:bool = false; + + // Parameters to encode a sparse tensor. See the example in + // tensorflow/lite/testdata/sparse_tensor.json. + sparsity:SparsityParameters; // Optional. + + // Encodes `shape` with unknown dimensions. Unknown dimensions are + // represented with -1. + shape_signature:[int]; // Optional. +} + +// A list of builtin operators. Builtin operators are slightly faster than custom +// ones, but not by much. Moreover, while custom operators accept an opaque +// object containing configuration parameters, builtins have a predetermined +// set of acceptable options. +// LINT.IfChange +enum BuiltinOperator : int32 { + ADD = 0, + AVERAGE_POOL_2D = 1, + CONCATENATION = 2, + CONV_2D = 3, + DEPTHWISE_CONV_2D = 4, + DEPTH_TO_SPACE = 5, + DEQUANTIZE = 6, + EMBEDDING_LOOKUP = 7, + FLOOR = 8, + FULLY_CONNECTED = 9, + HASHTABLE_LOOKUP = 10, + L2_NORMALIZATION = 11, + L2_POOL_2D = 12, + LOCAL_RESPONSE_NORMALIZATION = 13, + LOGISTIC = 14, + LSH_PROJECTION = 15, + LSTM = 16, + MAX_POOL_2D = 17, + MUL = 18, + RELU = 19, + // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed + // since different model developers use RELU1 in different ways. Never + // create another op called RELU1. + RELU_N1_TO_1 = 20, + RELU6 = 21, + RESHAPE = 22, + RESIZE_BILINEAR = 23, + RNN = 24, + SOFTMAX = 25, + SPACE_TO_DEPTH = 26, + SVDF = 27, + TANH = 28, + CONCAT_EMBEDDINGS = 29, + SKIP_GRAM = 30, + CALL = 31, + CUSTOM = 32, + EMBEDDING_LOOKUP_SPARSE = 33, + PAD = 34, + UNIDIRECTIONAL_SEQUENCE_RNN = 35, + GATHER = 36, + BATCH_TO_SPACE_ND = 37, + SPACE_TO_BATCH_ND = 38, + TRANSPOSE = 39, + MEAN = 40, + SUB = 41, + DIV = 42, + SQUEEZE = 43, + UNIDIRECTIONAL_SEQUENCE_LSTM = 44, + STRIDED_SLICE = 45, + BIDIRECTIONAL_SEQUENCE_RNN = 46, + EXP = 47, + TOPK_V2 = 48, + SPLIT = 49, + LOG_SOFTMAX = 50, + // DELEGATE is a special op type for the operations which are delegated to + // other backends. 
+ // WARNING: Experimental interface, subject to change + DELEGATE = 51, + BIDIRECTIONAL_SEQUENCE_LSTM = 52, + CAST = 53, + PRELU = 54, + MAXIMUM = 55, + ARG_MAX = 56, + MINIMUM = 57, + LESS = 58, + NEG = 59, + PADV2 = 60, + GREATER = 61, + GREATER_EQUAL = 62, + LESS_EQUAL = 63, + SELECT = 64, + SLICE = 65, + SIN = 66, + TRANSPOSE_CONV = 67, + SPARSE_TO_DENSE = 68, + TILE = 69, + EXPAND_DIMS = 70, + EQUAL = 71, + NOT_EQUAL = 72, + LOG = 73, + SUM = 74, + SQRT = 75, + RSQRT = 76, + SHAPE = 77, + POW = 78, + ARG_MIN = 79, + FAKE_QUANT = 80, + REDUCE_PROD = 81, + REDUCE_MAX = 82, + PACK = 83, + LOGICAL_OR = 84, + ONE_HOT = 85, + LOGICAL_AND = 86, + LOGICAL_NOT = 87, + UNPACK = 88, + REDUCE_MIN = 89, + FLOOR_DIV = 90, + REDUCE_ANY = 91, + SQUARE = 92, + ZEROS_LIKE = 93, + FILL = 94, + FLOOR_MOD = 95, + RANGE = 96, + RESIZE_NEAREST_NEIGHBOR = 97, + LEAKY_RELU = 98, + SQUARED_DIFFERENCE = 99, + MIRROR_PAD = 100, + ABS = 101, + SPLIT_V = 102, + UNIQUE = 103, + CEIL = 104, + REVERSE_V2 = 105, + ADD_N = 106, + GATHER_ND = 107, + COS = 108, + WHERE = 109, + RANK = 110, + ELU = 111, + REVERSE_SEQUENCE = 112, + MATRIX_DIAG = 113, + QUANTIZE = 114, + MATRIX_SET_DIAG = 115, + ROUND = 116, + HARD_SWISH = 117, + IF = 118, + WHILE = 119, + NON_MAX_SUPPRESSION_V4 = 120, + NON_MAX_SUPPRESSION_V5 = 121, + SCATTER_ND = 122, + SELECT_V2 = 123, + DENSIFY = 124, + SEGMENT_SUM = 125, + BATCH_MATMUL = 126, + PLACEHOLDER_FOR_GREATER_OP_CODES = 127, + CUMSUM = 128, + CALL_ONCE = 129, + BROADCAST_TO = 130, + RFFT2D = 131, + CONV_3D = 132, + IMAG=133, + REAL=134, + COMPLEX_ABS=135, + HASHTABLE = 136, + HASHTABLE_FIND = 137, + HASHTABLE_IMPORT = 138, + HASHTABLE_SIZE = 139, + REDUCE_ALL = 140, + CONV_3D_TRANSPOSE = 141, + VAR_HANDLE = 142, + READ_VARIABLE = 143, + ASSIGN_VARIABLE = 144, +} +// LINT.ThenChange(nnapi_linter/linter.proto) + +// Options for the builtin operators. 
+union BuiltinOptions { + Conv2DOptions, + DepthwiseConv2DOptions, + ConcatEmbeddingsOptions, + LSHProjectionOptions, + Pool2DOptions, + SVDFOptions, + RNNOptions, + FullyConnectedOptions, + SoftmaxOptions, + ConcatenationOptions, + AddOptions, + L2NormOptions, + LocalResponseNormalizationOptions, + LSTMOptions, + ResizeBilinearOptions, + CallOptions, + ReshapeOptions, + SkipGramOptions, + SpaceToDepthOptions, + EmbeddingLookupSparseOptions, + MulOptions, + PadOptions, + GatherOptions, + BatchToSpaceNDOptions, + SpaceToBatchNDOptions, + TransposeOptions, + ReducerOptions, + SubOptions, + DivOptions, + SqueezeOptions, + SequenceRNNOptions, + StridedSliceOptions, + ExpOptions, + TopKV2Options, + SplitOptions, + LogSoftmaxOptions, + CastOptions, + DequantizeOptions, + MaximumMinimumOptions, + ArgMaxOptions, + LessOptions, + NegOptions, + PadV2Options, + GreaterOptions, + GreaterEqualOptions, + LessEqualOptions, + SelectOptions, + SliceOptions, + TransposeConvOptions, + SparseToDenseOptions, + TileOptions, + ExpandDimsOptions, + EqualOptions, + NotEqualOptions, + ShapeOptions, + PowOptions, + ArgMinOptions, + FakeQuantOptions, + PackOptions, + LogicalOrOptions, + OneHotOptions, + LogicalAndOptions, + LogicalNotOptions, + UnpackOptions, + FloorDivOptions, + SquareOptions, + ZerosLikeOptions, + FillOptions, + BidirectionalSequenceLSTMOptions, + BidirectionalSequenceRNNOptions, + UnidirectionalSequenceLSTMOptions, + FloorModOptions, + RangeOptions, + ResizeNearestNeighborOptions, + LeakyReluOptions, + SquaredDifferenceOptions, + MirrorPadOptions, + AbsOptions, + SplitVOptions, + UniqueOptions, + ReverseV2Options, + AddNOptions, + GatherNdOptions, + CosOptions, + WhereOptions, + RankOptions, + ReverseSequenceOptions, + MatrixDiagOptions, + QuantizeOptions, + MatrixSetDiagOptions, + HardSwishOptions, + IfOptions, + WhileOptions, + DepthToSpaceOptions, + NonMaxSuppressionV4Options, + NonMaxSuppressionV5Options, + ScatterNdOptions, + SelectV2Options, + DensifyOptions, + SegmentSumOptions, + BatchMatMulOptions, + CumsumOptions, + CallOnceOptions, + BroadcastToOptions, + Rfft2dOptions, + Conv3DOptions, + HashtableOptions, + HashtableFindOptions, + HashtableImportOptions, + HashtableSizeOptions, + VarHandleOptions, + ReadVariableOptions, + AssignVariableOptions, +} + +enum Padding : byte { SAME, VALID } + +enum ActivationFunctionType : byte { + NONE = 0, + RELU = 1, + RELU_N1_TO_1 = 2, + RELU6 = 3, + TANH = 4, + SIGN_BIT = 5, +} + +table Conv2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + fused_activation_function:ActivationFunctionType; + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +// Options for both Conv3D and Conv3DTranspose. +table Conv3DOptions { + padding:Padding; + stride_d:int; + stride_w:int; + stride_h:int; + fused_activation_function:ActivationFunctionType; + dilation_d_factor:int = 1; + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table Pool2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + filter_width:int; + filter_height:int; + fused_activation_function:ActivationFunctionType; +} + +table DepthwiseConv2DOptions { + // Parameters for DepthwiseConv version 1 or above. + padding:Padding; + stride_w:int; + stride_h:int; + // `depth_multiplier` is redundant. It's used by CPU kernels in + // TensorFlow 2.0 or below, but ignored in versions above. + // See comments in lite/c/builtin_op_data.h for more details. 
+ depth_multiplier:int; + fused_activation_function:ActivationFunctionType; + // Parameters for DepthwiseConv version 2 or above. + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table ConcatEmbeddingsOptions { + num_channels:int; + num_columns_per_channel:[int]; + embedding_dim_per_channel:[int]; // This could be inferred from parameters. +} + +enum LSHProjectionType: byte { + UNKNOWN = 0, + SPARSE = 1, + DENSE = 2, +} + +table LSHProjectionOptions { + type: LSHProjectionType; +} + +table SVDFOptions { + rank:int; + fused_activation_function:ActivationFunctionType; + // For weights-only quantization, use asymmetric quantization for non + // constant inputs at evaluation time. + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow RNNCell. +table RNNOptions { + fused_activation_function:ActivationFunctionType; + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow dynamic_rnn with RNNCell. +table SequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; + asymmetric_quantize_inputs:bool; +} + +// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell. +table BidirectionalSequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; + merge_outputs: bool; + asymmetric_quantize_inputs:bool; +} + +enum FullyConnectedOptionsWeightsFormat: byte { + DEFAULT = 0, + SHUFFLED4x16INT8 = 1, +} + +// An implementation of TensorFlow fully_connected (a.k.a Dense) layer. +table FullyConnectedOptions { + // Parameters for FullyConnected version 1 or above. + fused_activation_function:ActivationFunctionType; + + // Parameters for FullyConnected version 2 or above. + weights_format:FullyConnectedOptionsWeightsFormat = DEFAULT; + + // Parameters for FullyConnected version 5 or above. + // If set to true, then the number of dimension is preserved. Furthermore, + // all but the last dimension of the input and output shapes will be equal. + keep_num_dims: bool; + + // Parameters for FullyConnected version 7 or above. + // If set to true, then weights-only op will use asymmetric quantization for + // inputs. + asymmetric_quantize_inputs: bool; +} + +table SoftmaxOptions { + beta: float; +} + +// An implementation of TensorFlow concat. +table ConcatenationOptions { + axis:int; + fused_activation_function:ActivationFunctionType; +} + +table AddOptions { + fused_activation_function:ActivationFunctionType; + // Parameters supported by version 3. + pot_scale_int16:bool = true; +} + +table MulOptions { + fused_activation_function:ActivationFunctionType; +} + +table L2NormOptions { + // This field is currently ignored in the L2 Norm Op. + fused_activation_function:ActivationFunctionType; +} + +table LocalResponseNormalizationOptions { + radius:int; + bias:float; + alpha:float; + beta:float; +} + +enum LSTMKernelType : byte { + // Full LSTM kernel which supports peephole and projection. + FULL = 0, + // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell. + BASIC = 1, +} + +// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell +table LSTMOptions { + // Parameters for LSTM version 1 or above. + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // Parameters for LSTM version 2 or above. + // Basic kernel is only supported in version 2 or above. + kernel_type: LSTMKernelType = FULL; + + // Parameters for LSTM version 4 or above. 
+ asymmetric_quantize_inputs: bool; +} + +// An implementation of TensorFlow dynamic_rnn with LSTMCell. +table UnidirectionalSequenceLSTMOptions { + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true then first dimension is sequence, otherwise batch. + time_major:bool; + + // Parameter for Unidirectional Sequence LSTM version 4. + asymmetric_quantize_inputs:bool; +} + +table BidirectionalSequenceLSTMOptions { + // Parameters supported by version 1: + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true, store the outputs of both directions into the first output. + merge_outputs: bool; + + // Parameters supported by version 2: + // If true then first dimension is sequence, otherwise batch. + // Version 1 implementations assumed time_major to be true, so this default + // value should never change. + time_major: bool = true; + + // Parameters for version 3 or above. + asymmetric_quantize_inputs:bool; +} + +table ResizeBilinearOptions { + new_height: int (deprecated); + new_width: int (deprecated); + align_corners: bool; + half_pixel_centers: bool; +} + +table ResizeNearestNeighborOptions { + align_corners: bool; + half_pixel_centers: bool; +} + +// A call operation options +table CallOptions { + // The subgraph index that needs to be called. + subgraph:uint; +} + +table PadOptions { +} + +table PadV2Options { +} + +table ReshapeOptions { + new_shape:[int]; +} + +table SpaceToBatchNDOptions { +} + +table BatchToSpaceNDOptions { +} + +table SkipGramOptions { + ngram_size: int; + max_skip_size: int; + include_all_ngrams: bool; +} + +table SpaceToDepthOptions { + block_size: int; +} + +table DepthToSpaceOptions { + block_size: int; +} + +table SubOptions { + fused_activation_function:ActivationFunctionType; + // Parameters supported by version 5 + pot_scale_int16:bool = true; +} + +table DivOptions { + fused_activation_function:ActivationFunctionType; +} + +table TopKV2Options { +} + +enum CombinerType : byte { + SUM = 0, + MEAN = 1, + SQRTN = 2, +} + +table EmbeddingLookupSparseOptions { + combiner:CombinerType; +} + +table GatherOptions { + axis: int; + // Parameters for Gather version 5 or above. 
+ batch_dims: int = 0; +} + +table TransposeOptions { +} + +table ExpOptions { +} + +table CosOptions { +} + +table ReducerOptions { + keep_dims: bool; +} + +table SqueezeOptions { + squeeze_dims:[int]; +} + +table SplitOptions { + num_splits: int; +} + +table SplitVOptions { + num_splits: int; +} + +table StridedSliceOptions { + begin_mask: int; + end_mask: int; + ellipsis_mask: int; + new_axis_mask: int; + shrink_axis_mask: int; +} + +table LogSoftmaxOptions { +} + +table CastOptions { + in_data_type: TensorType; + out_data_type: TensorType; +} + +table DequantizeOptions { +} + +table MaximumMinimumOptions { +} + +table TileOptions { +} + +table ArgMaxOptions { + output_type : TensorType; +} + +table ArgMinOptions { + output_type : TensorType; +} + +table GreaterOptions { +} + +table GreaterEqualOptions { +} + +table LessOptions { +} + +table LessEqualOptions { +} + +table NegOptions { +} + +table SelectOptions { +} + +table SliceOptions { +} + +table TransposeConvOptions { + padding:Padding; + stride_w:int; + stride_h:int; +} + +table ExpandDimsOptions { +} + +table SparseToDenseOptions { + validate_indices:bool; +} + +table EqualOptions { +} + +table NotEqualOptions { +} + +table ShapeOptions { + // Optional output type of the operation (int32 or int64). Defaults to int32. + out_type : TensorType; +} + +table RankOptions { +} + +table PowOptions { +} + +table FakeQuantOptions { + // Parameters supported by version 1: + min:float; + max:float; + num_bits:int; + + // Parameters supported by version 2: + narrow_range:bool; +} + +table PackOptions { + values_count:int; + axis:int; +} + +table LogicalOrOptions { +} + +table OneHotOptions { + axis:int; +} + +table AbsOptions { +} + + +table HardSwishOptions { +} + +table LogicalAndOptions { +} + +table LogicalNotOptions { +} + +table UnpackOptions { + num:int; + axis:int; +} + +table FloorDivOptions { +} + +table SquareOptions { +} + +table ZerosLikeOptions { +} + +table FillOptions { +} + +table FloorModOptions { +} + +table RangeOptions { +} + +table LeakyReluOptions { + alpha:float; +} + +table SquaredDifferenceOptions { +} + +enum MirrorPadMode : byte { + // Doesn't include borders. + REFLECT = 0, + // Includes borders. + SYMMETRIC = 1, +} + +table MirrorPadOptions { + mode:MirrorPadMode; +} + +table UniqueOptions { + idx_out_type:TensorType = INT32; +} + +table ReverseV2Options { +} + +table AddNOptions { +} + +table GatherNdOptions { +} + +table WhereOptions { +} + +table ReverseSequenceOptions { + seq_dim:int; + batch_dim:int = 0; +} + +table MatrixDiagOptions { +} + +table QuantizeOptions { +} + +table MatrixSetDiagOptions { +} + +table IfOptions { + then_subgraph_index:int; + else_subgraph_index:int; +} + +table CallOnceOptions { + init_subgraph_index:int; +} + +table WhileOptions { + cond_subgraph_index:int; + body_subgraph_index:int; +} + +table NonMaxSuppressionV4Options { +} + +table NonMaxSuppressionV5Options { +} + +table ScatterNdOptions { +} + +table SelectV2Options { +} + +table DensifyOptions { +} + +table SegmentSumOptions { +} + +table BatchMatMulOptions { + adj_x:bool; + adj_y:bool; + // Parameters for BatchMatMul version 4 or above. + // If set to true, then weights-only op will use asymmetric quantization for + // inputs. + asymmetric_quantize_inputs: bool; +} + +table CumsumOptions { + exclusive:bool; + reverse:bool; +} + +table BroadcastToOptions { +} + +table Rfft2dOptions { +} + +table HashtableOptions { + // The identity of hash tables. 
This identity will be used across different + // subgraphs in the same interpreter instance. + table_id:int; + key_dtype:TensorType; + value_dtype:TensorType; +} + +table HashtableFindOptions { +} + +table HashtableImportOptions { +} + +table HashtableSizeOptions { +} + +table VarHandleOptions { + container:string; + shared_name:string; +} + +table ReadVariableOptions { +} + +table AssignVariableOptions { +} + +// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a +// builtin, or a string if the operator is custom. +table OperatorCode { + // This field is for backward compatibility. This field will be used when + // the value of the extended builtin_code field has less than + // BulitinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES. + deprecated_builtin_code:byte; + custom_code:string; + + // The version of the operator. The version need to be bumped whenever new + // parameters are introduced into an op. + version:int = 1; + + // This field is introduced for resolving op builtin code shortage problem + // (the original BuiltinOperator enum field was represented as a byte). + // This field will be used when the value of the extended builtin_code field + // has greater than BulitinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES. + builtin_code:BuiltinOperator; +} + +enum CustomOptionsFormat : byte { + FLEXBUFFERS = 0, +} + +// An operator takes tensors as inputs and outputs. The type of operation being +// performed is determined by an index into the list of valid OperatorCodes, +// while the specifics of each operations is configured using builtin_options +// or custom_options. +table Operator { + // Index into the operator_codes array. Using an integer here avoids + // complicate map lookups. + opcode_index:uint; + + // Optional input are indicated by -1. + inputs:[int]; + outputs:[int]; + + builtin_options:BuiltinOptions; + custom_options:[ubyte]; + custom_options_format:CustomOptionsFormat; + + // A list of booleans indicating the input tensors which are being mutated by + // this operator.(e.g. used by RNN and LSTM). + // For example, if the "inputs" array refers to 5 tensors and the second and + // fifth are mutable variables, then this list will contain + // [false, true, false, false, true]. + // + // If the list is empty, no variable is mutated in this operator. + // The list either has the same length as `inputs`, or is empty. + mutating_variable_inputs:[bool]; + + // A list of indices to the subgraph's "tensors" that are internal to an Op. + // Internal tensors are those that do not flow in or out of the operation, + // but instead are part of internal computation. As such, the operation's + // implementation may manage its memory more efficiently. They are needed + // however (i.e. not just an implementation detail) since they are part of the + // computation, which may require relevant metadata such as quantization + // parameters. + intermediates:[int]; +} + +// The root type, defining a subgraph, which typically represents an entire +// model. +table SubGraph { + // A list of all tensors used in this subgraph. + tensors:[Tensor]; + + // Indices of the tensors that are inputs into this subgraph. Note this is + // the list of non-static tensors that feed into the subgraph for inference. + inputs:[int]; + + // Indices of the tensors that are outputs out of this subgraph. Note this is + // the list of output tensors that are considered the product of the + // subgraph's inference. + outputs:[int]; + + // All operators, in execution order. 
+ operators:[Operator]; + + // Name of this subgraph (used for debugging). + name:string; +} + +// Table of raw data buffers (used for constant tensors). Referenced by tensors +// by index. The generous alignment accommodates mmap-friendly data structures. +table Buffer { + data:[ubyte] (force_align: 16); +} + +table Metadata { + // A human readable string to uniquely identify a Metadata. + name:string; + // An index to the buffers table. + buffer:uint; +} + +// Map from an alias name of tensor to tensor index in the graph. +// This is used in Signature def. +table TensorMap { + // Represents the alias to use for this tensor. + name:string; + + // The actual tensor index in the primary graph, that 'name' corresponds to. + tensor_index:uint; +} + +// This corresponds to SignatureDef in Tensorflow SavedModel. +// The SignatureDef will be part of the SavedModel provided for conversion. +table SignatureDef { + // Named inputs for this signature. + inputs:[TensorMap]; + + // Named outputs for this signature. + outputs:[TensorMap]; + + // Key value which was in the Tensorflow SavedModel SignatureDef map. + signature_key:string; + + // Model tag, deprecated. + deprecated_tag:string (deprecated); + + // Index of subgraphs that corresponds to the exported method. + subgraph_index:uint; +} + +table Model { + // Version of the schema. + version:uint; + + // A list of all operator codes used in this model. This is + // kept in order because operators carry an index into this + // vector. + operator_codes:[OperatorCode]; + + // All the subgraphs of the model. The 0th is assumed to be the main + // model. + subgraphs:[SubGraph]; + + // A description of the model. + description:string; + + // Buffers of the model. + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 as + // their buffer. + buffers:[Buffer]; + + // Metadata about the model. Indirects into the existings buffers list. + // Deprecated, prefer to use metadata field. + metadata_buffer:[int]; + + // Metadata about the model. + metadata:[Metadata]; + + // Optional SignatureDefs for the model. 
+ signature_defs:[SignatureDef]; +} + +root_type Model; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 69f763abfda5a7..3790f64e0cec68 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -65,7 +65,6 @@ package_group( packages = [ "//tensorflow/...", "//tensorflow_text/...", - "//waymo/ml/compiler/frontend/kernels/...", "//waymo/onboard/ml/...", ], ) diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 9551bdd79d4ae5..c133556b4aaa43 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -45,8 +45,7 @@ class GatherNdOp : public OpKernel { Tensor out; OP_REQUIRES_OK( - c, functor::DoGatherNd( - c, params, indices, &out)); + c, functor::DoGatherNd(c, params, indices, &out)); c->set_output(0, out); } }; diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h index 6059a2bbdafb31..09bad00c59b070 100644 --- a/tensorflow/core/kernels/gather_nd_op.h +++ b/tensorflow/core/kernels/gather_nd_op.h @@ -43,8 +43,7 @@ struct GatherNdSlice { typename TTypes::Matrix Tout); }; -template +template Status DoGatherNd(OpKernelContext* c, const Tensor& params, const Tensor& indices, Tensor* out) { if (!TensorShapeUtils::IsVectorOrHigher(params.shape())) { @@ -152,10 +151,6 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params, indices_nd); } - if constexpr (kDropBadIndices) { - return absl::OkStatus(); - } - // bad_i will only return >= 0 on CPUs right now. if (bad_i >= 0) { auto shape = indices.shape(); diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index ea369fd49a5ea2..0f604b0e605879 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -878,7 +878,7 @@ class IndexFlattener { namespace { template + scatter_nd_op::UpdateOp Op> Status DoScatterNdImpl(OpKernelContext* c, const Tensor& indices, const Tensor& updates, const TensorShape& shape, Tensor* out, bool allocate) { @@ -925,11 +925,7 @@ Status DoScatterNdImpl(OpKernelContext* c, const Tensor& indices, for (int i = 0; i < IXDIM; ++i) { \ output_shape_prefix[i] = shape.dim_size(i); \ } \ - constexpr bool kShallDropBadIndices = \ - kDropBadIndices || std::is_same::value; \ - functor::ScatterNdFunctor \ - functor; \ + functor::ScatterNdFunctor functor; \ bad_i = \ functor(c->eigen_device(), slice_size, output_shape_prefix, \ output_matrix, indices_flat, updates_flat, output_matrix); \ @@ -951,9 +947,6 @@ Status DoScatterNdImpl(OpKernelContext* c, const Tensor& indices, slice_dim); } } - if constexpr (kDropBadIndices) { - return absl::OkStatus(); - } if (bad_i >= 0) { auto slice_shape = indices.shape(); slice_shape.RemoveLastDims(1); @@ -977,8 +970,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, // back to GPU. This is useful because the CPU implementation is deterministic // and the GPU implementation is not. Tensor inputs to this function must be on // the GPU. -template +template Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, const Tensor& updates, const TensorShape& shape, Tensor* out, bool allocate) { @@ -1023,7 +1015,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, } TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - TF_RETURN_IF_ERROR(DoScatterNd( + TF_RETURN_IF_ERROR(DoScatterNd( c, host_indices, host_updates, shape, &host_out, /*allocate=*/false)); // Copy 'host_out' to device. 
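The context lines above note that this host round trip exists because "the CPU implementation is deterministic and the GPU implementation is not." The underlying reason is that the GPU scatter kernel accumulates duplicate indices with atomic adds whose ordering varies from run to run, and floating-point addition is not associative. A standalone C++ illustration of that non-associativity (not TensorFlow code):

#include <cstdio>

int main() {
  // Summing the same three floats in two different orders gives two
  // different results, because float addition is not associative. An
  // atomics-based scatter fixes no accumulation order, so duplicate
  // indices can produce run-to-run differences; the CPU fallback above
  // sidesteps this when op determinism is required.
  float a = 1e8f, b = -1e8f, c = 1.0f;
  std::printf("%g\n", (a + b) + c);  // prints 1
  std::printf("%g\n", a + (b + c));  // prints 0: 1.0f is absorbed into -1e8f
  return 0;
}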
@@ -1041,15 +1033,15 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, } // namespace template + scatter_nd_op::UpdateOp Op> Status DoScatterNd(OpKernelContext* c, const Tensor& indices, const Tensor& updates, const TensorShape& shape, Tensor* out, bool allocate) { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM if (std::is_same::value && tensorflow::OpDeterminismRequired() && !DisableScatterOpDeterminism()) { - return DoScatterNdOnCpu( - c, indices, updates, shape, out, allocate); + return DoScatterNdOnCpu(c, indices, updates, shape, out, + allocate); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1057,11 +1049,11 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices, // atomics, which are not supported for all integer types. if constexpr (std::is_same::value && std::is_integral::value) { - return DoScatterNdOnCpu( - c, indices, updates, shape, out, allocate); + return DoScatterNdOnCpu(c, indices, updates, shape, out, + allocate); } else { - return DoScatterNdImpl( - c, indices, updates, shape, out, allocate); + return DoScatterNdImpl(c, indices, updates, shape, + out, allocate); } } } // namespace functor @@ -1069,29 +1061,16 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \ - template <> \ - Index \ - ScatterNdFunctor:: \ - operator()(const GPUDevice& d, const Index slice_size, \ - const Eigen::array output_shape_prefix, \ - typename TTypes::Tensor Tparams, \ - typename TTypes::ConstTensor Tindices, \ - typename TTypes::ConstTensor Tupdates, \ - typename TTypes::Tensor Toutput); \ - extern template struct ScatterNdFunctor; \ - template <> \ - Index ScatterNdFunctor:: \ - operator()(const GPUDevice& d, const Index slice_size, \ - const Eigen::array output_shape_prefix, \ - typename TTypes::Tensor Tparams, \ - typename TTypes::ConstTensor Tindices, \ - typename TTypes::ConstTensor Tupdates, \ - typename TTypes::Tensor Toutput); \ - extern template struct ScatterNdFunctor; +#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \ + template <> \ + Index ScatterNdFunctor::operator()( \ + const GPUDevice& d, const Index slice_size, \ + const Eigen::array output_shape_prefix, \ + typename TTypes::Tensor Tparams, \ + typename TTypes::ConstTensor Tindices, \ + typename TTypes::ConstTensor Tupdates, \ + typename TTypes::Tensor Toutput); \ + extern template struct ScatterNdFunctor; #define DECLARE_GPU_SPECS_INDEX_OP(T, Index, op) \ DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 1); \ diff --git a/tensorflow/core/kernels/scatter_nd_op.h b/tensorflow/core/kernels/scatter_nd_op.h index 8d2e74b18ca864..f9a2ce0ed6e12b 100644 --- a/tensorflow/core/kernels/scatter_nd_op.h +++ b/tensorflow/core/kernels/scatter_nd_op.h @@ -44,7 +44,7 @@ namespace functor { // Functor used by ScatterOp to do the computations. template + scatter_nd_op::UpdateOp op, int IXDIM> struct ScatterNdFunctor { // Returns -1 on success or a nonnegative i s.t. indices[i] is a bad index. Index operator()( @@ -63,7 +63,7 @@ struct ScatterNdFunctor { // right type (T) and shape. This tensor will not be zeroed out // before the scatter is executed. 
template + scatter_nd_op::UpdateOp Op> Status DoScatterNd(OpKernelContext* c, const Tensor& indices, const Tensor& updates, const TensorShape& shape, Tensor* out, bool allocate); diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index abdbc1ece968bf..b0123780cc6406 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -103,9 +103,8 @@ class UpdateExecutor { namespace functor { // Implementation of update functor for CPU. -template -struct ScatterNdFunctor { +template +struct ScatterNdFunctor { Index operator()( const CPUDevice& d, const Index slice_size, const Eigen::array output_shape_prefix, @@ -137,44 +136,33 @@ struct ScatterNdFunctor { i += ix_d * batch_strides[dim]; } if (TF_PREDICT_FALSE(out_of_bounds)) { - if constexpr (kDropBadIndices) { - continue; - } error_loc = loc; break; + } else { + auto input_chip = Toutput.template chip<0>(i); + auto output_chip = input_chip; + auto update_chip = Tupdates.template chip<0>(loc); + update_executor::UpdateExecutor< + CPUDevice, decltype(input_chip), decltype(update_chip), + decltype(output_chip), OP>::Execute(d, input_chip, update_chip, + output_chip); } - auto input_chip = Toutput.template chip<0>(i); - auto output_chip = input_chip; - auto update_chip = Tupdates.template chip<0>(loc); - update_executor::UpdateExecutor< - CPUDevice, decltype(input_chip), decltype(update_chip), - decltype(output_chip), OP>::Execute(d, input_chip, update_chip, - output_chip); } return error_loc; } }; -#define REGISTER_SCATTER_ND_FULL(T, Index, op) \ - template Index ScatterNdFunctor:: \ - operator()(const CPUDevice& d, const Index slice_size, \ - const Eigen::array \ - output_shape_prefix, \ - typename TTypes::Tensor Tparams, \ - typename TTypes::ConstTensor Tindices, \ - typename TTypes::ConstTensor Tupdates, \ - typename TTypes::Tensor Toutput); \ - template Index ScatterNdFunctor:: \ - operator()(const CPUDevice& d, const Index slice_size, \ - const Eigen::array \ - output_shape_prefix, \ - typename TTypes::Tensor Tparams, \ - typename TTypes::ConstTensor Tindices, \ - typename TTypes::ConstTensor Tupdates, \ - typename TTypes::Tensor Toutput) +#define REGISTER_SCATTER_ND_FULL(T, Index, op) \ + template Index \ + ScatterNdFunctor::operator()( \ + const CPUDevice& d, const Index slice_size, \ + const Eigen::array \ + output_shape_prefix, \ + typename TTypes::Tensor Tparams, \ + typename TTypes::ConstTensor Tindices, \ + typename TTypes::ConstTensor Tupdates, \ + typename TTypes::Tensor Toutput) #define REGISTER_SCATTER_ND_INDEX(type, op) \ REGISTER_SCATTER_ND_FULL(type, int32, op); \ diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index 4e528c58e6ba0f..fd1d4747c40982 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -124,9 +124,8 @@ __global__ void ScatterNdOpKernel( namespace functor { // Functor used by ScatterOp to do the computations. 
-template -struct ScatterNdFunctor { +template +struct ScatterNdFunctor { Index operator()( const GPUDevice& d, const Index slice_size, const Eigen::array output_shape_prefix, @@ -165,9 +164,8 @@ struct ScatterNdFunctor { } // namespace functor -#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \ - template struct functor::ScatterNdFunctor; +#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \ + template struct functor::ScatterNdFunctor; #define DECLARE_GPU_SPECS_INDEX_OP(T, Index, op) \ DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 1); \ diff --git a/tensorflow/core/ops/uniform_quant_ops.cc b/tensorflow/core/ops/uniform_quant_ops.cc index 514c9f9278d8c5..c5fcb762dabd13 100644 --- a/tensorflow/core/ops/uniform_quant_ops.cc +++ b/tensorflow/core/ops/uniform_quant_ops.cc @@ -29,7 +29,8 @@ using tensorflow::errors::Unknown; // If the rank and all dim sizes are known, return corresponding TensorShape. // Otherwise return Unknown error. -StatusOr ToTensorShape(ShapeHandle shape_handle, int64_t rank) { +absl::StatusOr ToTensorShape(ShapeHandle shape_handle, + int64_t rank) { TensorShape shape; for (int i = 0; i < rank; ++i) { int64_t dim_size = shape_inference::InferenceContext::Value( diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 0ab15886f47593..b188b0142e52a7 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 1872 // Updated: 2024/5/24 +#define TF_GRAPH_DEF_VERSION 1876 // Updated: 2024/5/28 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // diff --git a/tensorflow/core/tfrt/common/BUILD b/tensorflow/core/tfrt/common/BUILD index 8129cdb0ea0f65..ac5a88c0d326f6 100644 --- a/tensorflow/core/tfrt/common/BUILD +++ b/tensorflow/core/tfrt/common/BUILD @@ -46,8 +46,6 @@ cc_library( visibility = [":friends"], deps = [ "//tensorflow/core:framework", - "//tensorflow/core:lib", - "@com_google_absl//absl/memory", "@local_xla//xla/pjrt:utils", "@tf_runtime//:hostcontext", ], @@ -64,6 +62,8 @@ cc_library( visibility = [":friends"], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:portable_gif_internal", + "@com_google_absl//absl/log:check", "@local_xla//xla/pjrt:pjrt_client", "@tf_runtime//:hostcontext", "@tf_runtime//:support", @@ -96,9 +96,15 @@ cc_library( ":pjrt_client_factory_options", ":pjrt_client_factory_registry", "//tensorflow/core:framework", + "//tensorflow/core:portable_gif_internal", + "//tensorflow/core/framework:resource_base", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/synchronization", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/client:local_client", "@local_xla//xla/pjrt:local_device_state", "@local_xla//xla/pjrt:pjrt_client", @@ -121,11 +127,14 @@ cc_library( deps = [ ":global_state", ":pjrt_state", + "//tensorflow/core:framework", "//tensorflow/core:framework_types_hdr", "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:refcount", "//tensorflow/core/platform:status", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:errors", "@local_xla//xla/pjrt:pjrt_client", ], ) @@ -144,7 +153,12 @@ cc_library( deps = [ ":global_state", 
":pjrt_state", + "//tensorflow/core:framework", "//tensorflow/core:framework_types_hdr", + "//tensorflow/core/platform:refcount", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:errors", "@local_xla//xla/pjrt:pjrt_client", ], ) @@ -153,16 +167,17 @@ tf_cc_test( name = "pjrt_state_test", srcs = ["pjrt_state_test.cc"], deps = [ - ":global_state", + ":pjrt_cpu_client_registration", ":pjrt_state", "//tensorflow/core:framework_types_hdr", "//tensorflow/core:test", - "//tensorflow/core/platform:status_matchers", + "//tensorflow/core/platform:refcount", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", - "//tensorflow/core/tfrt/common:pjrt_cpu_client_registration", "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/pjrt:pjrt_client", - "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client", + "@local_xla//xla/pjrt/cpu:cpu_client", ], ) @@ -170,7 +185,6 @@ tf_cc_test( name = "pjrt_util_test", srcs = ["pjrt_util_test.cc"], deps = [ - ":global_state", ":pjrt_state", ":pjrt_util", "//tensorflow/core:framework", @@ -180,7 +194,7 @@ tf_cc_test( "@local_tsl//tsl/platform:test", "@local_tsl//tsl/platform:test_main", "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc", - "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client", + "@local_xla//xla/pjrt/cpu:cpu_client", ], ) @@ -199,6 +213,7 @@ tf_cuda_cc_test( ":pjrt_state", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework", + "@com_google_absl//absl/strings", "@local_tsl//tsl/lib/core:status_test_util", "@local_tsl//tsl/platform:status_matchers", "@local_tsl//tsl/platform:test_main", @@ -223,9 +238,12 @@ cc_library( ":pjrt_client_factory_options", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@local_tsl//tsl/framework:device_type", - "@local_tsl//tsl/platform:statusor", - "@local_xla//xla:statusor", + "@local_tsl//tsl/platform:errors", "@local_xla//xla/pjrt:pjrt_client", ], ) @@ -237,9 +255,10 @@ cc_library( ":pjrt_client_factory_options", ":pjrt_client_factory_registry", "//tensorflow/core:framework_types_hdr", - "@local_xla//xla:statusor", + "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/pjrt:pjrt_client", - "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client", + "@local_xla//xla/pjrt/cpu:cpu_client", ], alwayslink = True, ) @@ -253,6 +272,7 @@ tf_cc_test( ":pjrt_cpu_client_registration", "//tensorflow/core:framework_types_hdr", "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:statusor", ], ) @@ -264,7 +284,8 @@ cc_library( ":pjrt_client_factory_registry", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework_types_hdr", - "@local_xla//xla:statusor", + "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/gpu:se_gpu_pjrt_client", ], @@ -296,6 +317,7 @@ tf_cuda_cc_test( ":pjrt_gpu_client_registration", "//tensorflow/core:framework_types_hdr", "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/service:gpu_plugin", ], ) diff --git a/tensorflow/core/tfrt/common/async_value_tensor.cc b/tensorflow/core/tfrt/common/async_value_tensor.cc index d78c41051d29b8..09b86690157ff0 100644 --- 
a/tensorflow/core/tfrt/common/async_value_tensor.cc +++ b/tensorflow/core/tfrt/common/async_value_tensor.cc @@ -14,11 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/tfrt/common/async_value_tensor.h" +#include +#include #include #include +#include "absl/log/check.h" #include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/tensor.h" #include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime namespace tensorflow { diff --git a/tensorflow/core/tfrt/common/async_value_tensor.h b/tensorflow/core/tfrt/common/async_value_tensor.h index 25ce153b516298..06e99f8f7bcc48 100644 --- a/tensorflow/core/tfrt/common/async_value_tensor.h +++ b/tensorflow/core/tfrt/common/async_value_tensor.h @@ -15,10 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TFRT_COMMON_ASYNC_VALUE_TENSOR_H_ #define TENSORFLOW_CORE_TFRT_COMMON_ASYNC_VALUE_TENSOR_H_ +#include #include #include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" #include "tfrt/support/forward_decls.h" // from @tf_runtime #include "tfrt/support/ref_count.h" // from @tf_runtime diff --git a/tensorflow/core/tfrt/common/create_pjrt_client_util.cc b/tensorflow/core/tfrt/common/create_pjrt_client_util.cc index 73f7dfc6de0e3e..b611b183de9032 100644 --- a/tensorflow/core/tfrt/common/create_pjrt_client_util.cc +++ b/tensorflow/core/tfrt/common/create_pjrt_client_util.cc @@ -17,9 +17,15 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/tfrt/common/global_state.h" #include "tensorflow/core/tfrt/common/pjrt_state.h" +#include "tsl/platform/errors.h" namespace tensorflow { diff --git a/tensorflow/core/tfrt/common/create_pjrt_client_util.h b/tensorflow/core/tfrt/common/create_pjrt_client_util.h index 945cea4efd4098..fe8dfbb8db5f23 100644 --- a/tensorflow/core/tfrt/common/create_pjrt_client_util.h +++ b/tensorflow/core/tfrt/common/create_pjrt_client_util.h @@ -15,10 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TFRT_COMMON_CREATE_PJRT_CLIENT_UTIL_H_ #define TENSORFLOW_CORE_TFRT_COMMON_CREATE_PJRT_CLIENT_UTIL_H_ -#include #include #include +#include "absl/status/statusor.h" #include "xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/types.h" diff --git a/tensorflow/core/tfrt/common/create_pjrt_client_util_test.cc b/tensorflow/core/tfrt/common/create_pjrt_client_util_test.cc index 4eab11a48c411b..027bf7bed783aa 100644 --- a/tensorflow/core/tfrt/common/create_pjrt_client_util_test.cc +++ b/tensorflow/core/tfrt/common/create_pjrt_client_util_test.cc @@ -14,7 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" // IWYU pragma: keep #include "tensorflow/core/framework/types.h" #include "tsl/platform/status_matchers.h" diff --git a/tensorflow/core/tfrt/common/global_state.cc b/tensorflow/core/tfrt/common/global_state.cc index 75d15d010234bf..61279217c06325 100644 --- a/tensorflow/core/tfrt/common/global_state.cc +++ b/tensorflow/core/tfrt/common/global_state.cc @@ -17,9 +17,8 @@ limitations under the License. #include #include -#include "absl/memory/memory.h" #include "xla/pjrt/utils.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime #include "tfrt/host_context/host_allocator.h" // from @tf_runtime #include "tfrt/host_context/host_context.h" // from @tf_runtime diff --git a/tensorflow/core/tfrt/common/pjrt_client_factory_options.h b/tensorflow/core/tfrt/common/pjrt_client_factory_options.h index 47caf2116af6b7..70e3092c2df654 100644 --- a/tensorflow/core/tfrt/common/pjrt_client_factory_options.h +++ b/tensorflow/core/tfrt/common/pjrt_client_factory_options.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_OPTIONS_H_ #define TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_OPTIONS_H_ -#include -#include #include #include #include diff --git a/tensorflow/core/tfrt/common/pjrt_client_factory_registry.cc b/tensorflow/core/tfrt/common/pjrt_client_factory_registry.cc index d792a9b2f6b5e6..bea5b42e7b4c20 100644 --- a/tensorflow/core/tfrt/common/pjrt_client_factory_registry.cc +++ b/tensorflow/core/tfrt/common/pjrt_client_factory_registry.cc @@ -16,9 +16,16 @@ limitations under the License. #include #include -#include -#include "tsl/platform/statusor.h" +#include "absl/log/log.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" +#include "tsl/framework/device_type.h" +#include "tsl/platform/errors.h" namespace xla { PjrtClientFactoryRegistry& PjrtClientFactoryRegistry::Get() { diff --git a/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h b/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h index 2950772b1ea6f2..01568d11ec1b51 100644 --- a/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h +++ b/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h @@ -19,8 +19,9 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" #include "xla/pjrt/pjrt_client.h" -#include "xla/statusor.h" #include "tensorflow/core/framework/registration/registration.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" diff --git a/tensorflow/core/tfrt/common/pjrt_cpu_client_registration.cc b/tensorflow/core/tfrt/common/pjrt_cpu_client_registration.cc index b114821d2f20ec..75bfa24a6b6ad3 100644 --- a/tensorflow/core/tfrt/common/pjrt_cpu_client_registration.cc +++ b/tensorflow/core/tfrt/common/pjrt_cpu_client_registration.cc @@ -16,11 +16,13 @@ limitations under the License. 
#include #include -#include "xla/pjrt/tfrt_cpu_pjrt_client.h" -#include "xla/statusor.h" +#include "absl/status/statusor.h" +#include "xla/pjrt/cpu/cpu_client.h" +#include "xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_registry.h" +#include "tsl/platform/statusor.h" namespace xla { diff --git a/tensorflow/core/tfrt/common/pjrt_cpu_client_registration_test.cc b/tensorflow/core/tfrt/common/pjrt_cpu_client_registration_test.cc index 26d6884e91006c..773d1223507038 100644 --- a/tensorflow/core/tfrt/common/pjrt_cpu_client_registration_test.cc +++ b/tensorflow/core/tfrt/common/pjrt_cpu_client_registration_test.cc @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include #include #include "tensorflow/core/framework/types.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_registry.h" +#include "tsl/framework/device_type.h" +#include "tsl/platform/statusor.h" namespace xla { namespace { diff --git a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc index ead40c6f39c254..99b1fab73f6052 100644 --- a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc +++ b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc @@ -16,13 +16,14 @@ limitations under the License. #include #include +#include "absl/status/statusor.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/pjrt/gpu/se_gpu_pjrt_client.h" #include "xla/pjrt/pjrt_client.h" -#include "xla/statusor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_registry.h" +#include "tsl/platform/statusor.h" namespace xla { absl::StatusOr> GetGpuClient( diff --git a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration_test.cc b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration_test.cc index f4feb34b541cb7..2eeca7a71eca12 100644 --- a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration_test.cc +++ b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration_test.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include #include #include "tensorflow/core/framework/types.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_registry.h" +#include "tsl/framework/device_type.h" +#include "tsl/platform/statusor.h" namespace xla { namespace { diff --git a/tensorflow/core/tfrt/common/pjrt_state.cc b/tensorflow/core/tfrt/common/pjrt_state.cc index a1a8e2366c6a38..12a8937d389c9a 100644 --- a/tensorflow/core/tfrt/common/pjrt_state.cc +++ b/tensorflow/core/tfrt/common/pjrt_state.cc @@ -18,11 +18,17 @@ limitations under the License. 
#include #include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/tf_pjrt_client.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" #include "tensorflow/core/tfrt/common/pjrt_client_factory_registry.h" +#include "tsl/platform/statusor.h" namespace tensorflow { diff --git a/tensorflow/core/tfrt/common/pjrt_state.h b/tensorflow/core/tfrt/common/pjrt_state.h index 180163376b4cd2..4863fc9e7d7e0c 100644 --- a/tensorflow/core/tfrt/common/pjrt_state.h +++ b/tensorflow/core/tfrt/common/pjrt_state.h @@ -17,14 +17,22 @@ limitations under the License. #include #include +#include #include +#include "absl/base/thread_annotations.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" #include "xla/client/local_client.h" #include "xla/pjrt/local_device_state.h" #include "xla/pjrt/pjrt_client.h" #include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "tensorflow/core/framework/resource_base.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/framework/allocator.h" namespace tensorflow { diff --git a/tensorflow/core/tfrt/common/pjrt_state_test.cc b/tensorflow/core/tfrt/common/pjrt_state_test.cc index 0b8cf6e1b9bbf8..fddd72ea050509 100644 --- a/tensorflow/core/tfrt/common/pjrt_state_test.cc +++ b/tensorflow/core/tfrt/common/pjrt_state_test.cc @@ -19,19 +19,20 @@ limitations under the License. #include #include +#include "xla/pjrt/cpu/cpu_client.h" #include "xla/pjrt/pjrt_client.h" -#include "xla/pjrt/tfrt_cpu_pjrt_client.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/status_matchers.h" +#include "tsl/platform/statusor.h" namespace { using tensorflow::PjRtState; using ::testing::HasSubstr; - -using ::tensorflow::testing::StatusIs; +using ::tsl::testing::StatusIs; class PjRtStateTestFixture : public testing::Test { protected: diff --git a/tensorflow/core/tfrt/common/pjrt_util.cc b/tensorflow/core/tfrt/common/pjrt_util.cc index 643632d5706a47..54ed3060adbc08 100644 --- a/tensorflow/core/tfrt/common/pjrt_util.cc +++ b/tensorflow/core/tfrt/common/pjrt_util.cc @@ -15,17 +15,19 @@ limitations under the License. 
#include "tensorflow/core/tfrt/common/pjrt_util.h" #include -#include -#include #include #include "absl/status/status.h" #include "absl/status/statusor.h" #include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/tfrt/common/global_state.h" #include "tensorflow/core/tfrt/common/pjrt_state.h" +#include "tsl/platform/errors.h" namespace tensorflow { diff --git a/tensorflow/core/tfrt/common/pjrt_util.h b/tensorflow/core/tfrt/common/pjrt_util.h index ce9cbc1d11c287..2895f22bf4ea92 100644 --- a/tensorflow/core/tfrt/common/pjrt_util.h +++ b/tensorflow/core/tfrt/common/pjrt_util.h @@ -16,9 +16,8 @@ limitations under the License. #define TENSORFLOW_CORE_TFRT_COMMON_PJRT_UTIL_H_ #include -#include -#include +#include "absl/status/statusor.h" #include "xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/status.h" diff --git a/tensorflow/core/tfrt/common/pjrt_util_test.cc b/tensorflow/core/tfrt/common/pjrt_util_test.cc index f8de14dd034812..1361b72c2da686 100644 --- a/tensorflow/core/tfrt/common/pjrt_util_test.cc +++ b/tensorflow/core/tfrt/common/pjrt_util_test.cc @@ -17,10 +17,8 @@ limitations under the License. #include #include -#include "xla/pjrt/tfrt_cpu_pjrt_client.h" -#include "tensorflow/core/framework/resource_mgr.h" +#include "xla/pjrt/cpu/cpu_client.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/tfrt/common/global_state.h" #include "tensorflow/core/tfrt/common/pjrt_state.h" #include "tsl/lib/core/status_test_util.h" #include "tsl/platform/status_matchers.h" diff --git a/tensorflow/core/tfrt/graph_executor/graph_executor.cc b/tensorflow/core/tfrt/graph_executor/graph_executor.cc index 979590bf83aac7..5580e69e4681cb 100644 --- a/tensorflow/core/tfrt/graph_executor/graph_executor.cc +++ b/tensorflow/core/tfrt/graph_executor/graph_executor.cc @@ -1130,8 +1130,7 @@ GraphExecutor::LoadedClientGraph::LoadedClientGraph( pflr_(&graph_executor->fallback_state().device_manager(), graph_executor->fallback_state().session_options().env, &graph_executor->fallback_state().session_options().config, - TF_GRAPH_DEF_VERSION, - &graph_executor->fallback_state().func_lib_def(), + TF_GRAPH_DEF_VERSION, &flib_def_, graph_executor->fallback_state() .session_options() .config.graph_options() diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt index 9f60eb3ac4d235..e290b0967f75dd 100644 --- a/tensorflow/lite/CMakeLists.txt +++ b/tensorflow/lite/CMakeLists.txt @@ -711,6 +711,13 @@ if(TFLITE_KERNEL_TEST) add_subdirectory(${TFLITE_SOURCE_DIR}/kernels) endif() +# Add the generated headers directory. Required for maintaining the +# tensorflow/lite directory structure for generated headers. +set(TFLITE_GENERATED_HEADERS_DIR ${CMAKE_BINARY_DIR}/tensorflow/lite) + +# Add the profiling proto directory. +add_subdirectory(${TFLITE_SOURCE_DIR}/profiling/proto) + # The benchmark tool. 
add_subdirectory(${TFLITE_SOURCE_DIR}/tools/benchmark) diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc index 2419b2c9325ad3..8b4de50df0bd84 100644 --- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc +++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc @@ -115,23 +115,6 @@ absl::Status LoadOpenCL() { } #else void* libopencl = nullptr; -#ifdef __ANDROID__ - // Pixel phone or auto? - libopencl = - AndroidDlopenSphalLibrary("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); - if (!libopencl) { - libopencl = - AndroidDlopenSphalLibrary("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL); - } - if (libopencl) { - typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast(dlsym(libopencl, "enableOpenCL")); - enableOpenCL(); - LoadOpenCLFunctions(libopencl, true); - return absl::OkStatus(); - } -#endif #ifdef __APPLE__ static const char* kClLibName = "/System/Library/Frameworks/OpenCL.framework/OpenCL"; @@ -140,6 +123,23 @@ absl::Status LoadOpenCL() { #endif #ifdef __ANDROID__ libopencl = AndroidDlopenSphalLibrary(kClLibName, RTLD_NOW | RTLD_LOCAL); + if (!libopencl) { + // Legacy Pixel phone or auto path? + libopencl = + AndroidDlopenSphalLibrary("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); + if (!libopencl) { + libopencl = + AndroidDlopenSphalLibrary("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL); + } + if (libopencl) { + typedef void (*enableOpenCL_t)(); + enableOpenCL_t enableOpenCL = + reinterpret_cast(dlsym(libopencl, "enableOpenCL")); + enableOpenCL(); + LoadOpenCLFunctions(libopencl, true); + return absl::OkStatus(); + } + } #else libopencl = dlopen(kClLibName, RTLD_NOW | RTLD_LOCAL); #endif diff --git a/tensorflow/lite/delegates/utils/experimental/stable_delegate/BUILD b/tensorflow/lite/delegates/utils/experimental/stable_delegate/BUILD index 9b0cda01bc559a..f37fd78e0f613f 100644 --- a/tensorflow/lite/delegates/utils/experimental/stable_delegate/BUILD +++ b/tensorflow/lite/delegates/utils/experimental/stable_delegate/BUILD @@ -133,6 +133,7 @@ cc_library( srcs = ["kernel_test_main.cc"], visibility = ["//visibility:public"], deps = [ + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/kernels:acceleration_test_util", "//tensorflow/lite/kernels:acceleration_test_util_internal", "//tensorflow/lite/kernels:test_delegate_providers_lib", diff --git a/tensorflow/lite/delegates/utils/experimental/stable_delegate/kernel_test_main.cc b/tensorflow/lite/delegates/utils/experimental/stable_delegate/kernel_test_main.cc index 3c0d4c5a93f2ee..f3fe76d395a79e 100644 --- a/tensorflow/lite/delegates/utils/experimental/stable_delegate/kernel_test_main.cc +++ b/tensorflow/lite/delegates/utils/experimental/stable_delegate/kernel_test_main.cc @@ -15,8 +15,10 @@ limitations under the License. #include #include +#include #include #include "benchmark/benchmark.h" // from @com_google_benchmark +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/kernels/acceleration_test_util.h" #include "tensorflow/lite/kernels/acceleration_test_util_internal.h" #include "tensorflow/lite/kernels/test_delegate_providers.h" @@ -84,7 +86,16 @@ void ValidateAcceleration(const SingleOpModel& model) { GetAccelerationTestParam(test_id) .has_value(); if (!supported) { + // Note that the error `kTfLiteApplicationError` is accepted here. + // We only want to check the delegate is working properly, so an error due + // to incompatibility between the model and the delegate is not considered a + // failure here. 
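The expectation emitted just below implements a deliberately tolerant policy, which is easier to audit as a small predicate. A standalone sketch of the same logic, with plain ints standing in for the TfLiteStatus values:

#include <optional>

// Stand-ins for kTfLiteOk and kTfLiteApplicationError (values illustrative).
constexpr int kOk = 0;
constexpr int kApplicationError = 3;

// For unsupported cases: "delegate never applied" (nullopt) defaults to kOk
// via value_or, and a model/delegate incompatibility is not a failure.
bool UnsupportedCaseAcceptable(std::optional<int> status) {
  const int s = status.value_or(kOk);
  return s == kOk || s == kApplicationError;
}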
+ EXPECT_THAT(model.GetDelegateApplicationStatus().value_or(kTfLiteOk), + testing::AnyOf(kTfLiteOk, kTfLiteApplicationError)); return; + } else { + EXPECT_EQ(model.GetDelegateApplicationStatus().value_or(kTfLiteOk), + kTfLiteOk); } // If we have multiple delegates applied, we would skip this check at the @@ -135,9 +146,7 @@ bool InitKernelTest(int* argc, char** argv) { return true; } -void DestroyKernelTest() { - DelegateTestSuiteAccelerationTestParams::Destroy(); -} +void DestroyKernelTest() { DelegateTestSuiteAccelerationTestParams::Destroy(); } } // namespace } // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc index 5654c285c8d150..cab06da2807b8d 100644 --- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc @@ -816,40 +816,5 @@ TEST(Conv2D, TransientIndirectionBuffer) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, AdaptiveAvxOptimization) { - TfLiteXNNPackDelegateOptions xnnpack_options = - TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - xnnpack_options.experimental_adaptive_avx_optimization = true; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto batch_rng = - std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); - auto input_rng = - std::bind(std::uniform_int_distribution(5, 25), std::ref(rng)); - auto kernel_rng = - std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); - auto stride_rng = - std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); - auto channel_rng = - std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); - - Conv2DTester() - .BatchSize(batch_rng()) - .InputHeight(input_rng()) - .InputWidth(input_rng()) - .InputChannels(channel_rng()) - .OutputChannels(channel_rng()) - .KernelHeight(kernel_rng()) - .KernelWidth(kernel_rng()) - .StrideHeight(stride_rng()) - .StrideWidth(stride_rng()) - .Test(xnnpack_delegate.get()); -} - } // namespace xnnpack } // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index ff54dee09a0fb5..33e1d317bce6c8 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -628,10 +628,6 @@ class Delegate { #endif } - bool experimental_adaptive_avx_optimization() const { - return options_.experimental_adaptive_avx_optimization; - } - pthreadpool_t threadpool() const { #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__) return nullptr; @@ -1120,9 +1116,6 @@ class Subgraph { if (delegate.transient_indirection_buffer()) { flags |= XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER; } - if (delegate.experimental_adaptive_avx_optimization()) { - xnn_experiment_enable_adaptive_avx_optimization(); - } if (delegate.force_fp16()) { flags |= XNN_FLAG_FORCE_FP16_INFERENCE; } else { diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h index 55eddcf1a54d67..dd5bf1adc4f587 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h @@ -68,8 +68,6 @@ typedef struct { // Deprecated. Use the flags bitfield with the // TFLITE_XNNPACK_DELEGATE_FLAG_VARIABLE_OPERATORS mask. 
bool handle_variable_ops; - // Enable adaptive optimization for AVX CPUs. - bool experimental_adaptive_avx_optimization; // Path to the weight cache to load if `weight_cache` is undefined. // // WARNING this is an experimental flag. diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc index 1dd47692c50819..99ab45a13c71a7 100644 --- a/tensorflow/lite/kernels/test_util.cc +++ b/tensorflow/lite/kernels/test_util.cc @@ -278,7 +278,9 @@ TfLiteStatus SingleOpModel::ApplyDelegate() { if (delegate_) { TFLITE_LOG(WARN) << "Having a manually-set TfLite delegate, and bypassing " "KernelTestDelegateProviders"; - TF_LITE_ENSURE_STATUS(interpreter_->ModifyGraphWithDelegate(delegate_)); + SetDelegateApplicationStatus( + interpreter_->ModifyGraphWithDelegate(delegate_)); + TF_LITE_ENSURE_STATUS(*GetDelegateApplicationStatus()); ++num_applied_delegates_; } else { auto* delegate_providers = tflite::KernelTestDelegateProviders::Get(); @@ -292,8 +294,9 @@ TfLiteStatus SingleOpModel::ApplyDelegate() { for (auto& one : delegate_providers->CreateAllDelegates()) { // The raw ptr always points to the actual TfLiteDegate object. auto* delegate_raw_ptr = one.delegate.get(); - TF_LITE_ENSURE_STATUS( + SetDelegateApplicationStatus( interpreter_->ModifyGraphWithDelegate(std::move(one.delegate))); + TF_LITE_ENSURE_STATUS(*GetDelegateApplicationStatus()); // Note: 'delegate_' is always set to the last successfully applied one. delegate_ = delegate_raw_ptr; ++num_applied_delegates_; diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index c4c18fb3eef57f..710ab60d0e28e0 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -29,6 +29,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -747,6 +748,13 @@ class SingleOpModel { int CountOpsExecutedByCpuKernel(); int CountNumberOfDelegatedPartitions() const; int GetNumberOfAppliedDelegates() const { return num_applied_delegates_; } + // Return the most recent return status of ApplyDelegate. + std::optional GetDelegateApplicationStatus() const { + return delegate_application_status_; + } + void SetDelegateApplicationStatus(std::optional status) { + delegate_application_status_ = status; + } // Tell TF Lite runtime to apply default delegates (i.e. XNNPACK delegate) // when handling this op-level model. @@ -1082,6 +1090,7 @@ class SingleOpModel { std::vector> tensors_; std::vector> buffers_; TfLiteDelegate* delegate_ = nullptr; // not own the memory. 
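The member introduced just below caches the outcome of the most recent ModifyGraphWithDelegate call, with std::nullopt meaning no delegate application was ever attempted. A minimal sketch of the pattern, independent of the TF Lite types; in SingleOpModel the same idea lets ApplyDelegate expose its failure code to tests without changing its TfLiteStatus return value.

#include <optional>

class DelegateStatusCache {
 public:
  // Record the status of the latest delegate-application attempt.
  void Set(int status) { status_ = status; }
  // Remains nullopt until Set() has been called at least once.
  std::optional<int> Get() const { return status_; }

 private:
  std::optional<int> status_;
};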
+ std::optional delegate_application_status_ = std::nullopt; std::vector> input_shapes_; int num_applied_delegates_ = 0; bool allow_fp32_relax_to_fp16_ = false; diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD index 54920ef71dc625..03b5438a973ee9 100644 --- a/tensorflow/lite/profiling/BUILD +++ b/tensorflow/lite/profiling/BUILD @@ -194,6 +194,8 @@ cc_library( copts = common_copts, deps = [ "//tensorflow/core/util:stats_calculator_portable", + "//tensorflow/lite/profiling/proto:profiling_info_cc_proto", + "//tensorflow/lite/tools:logging", ], ) @@ -202,6 +204,7 @@ cc_test( srcs = ["profile_summary_formatter_test.cc"], deps = [ ":profile_summary_formatter", + "//tensorflow/lite/profiling/proto:profiling_info_cc_proto", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc index 4bbf4e403a2f2d..f8e461f2baebea 100644 --- a/tensorflow/lite/profiling/profile_summarizer.cc +++ b/tensorflow/lite/profiling/profile_summarizer.cc @@ -201,6 +201,8 @@ void ProfileSummarizer::ProcessProfiles( if (delegate_internal_total_us > 0) { delegate_stats_calculator_->UpdateRunTotalUs(delegate_internal_total_us); } + + SetSubgraphNameMap(interpreter); } tensorflow::StatsCalculator* ProfileSummarizer::GetStatsCalculator( diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h index 3007440d680159..986bb691c18aee 100644 --- a/tensorflow/lite/profiling/profile_summarizer.h +++ b/tensorflow/lite/profiling/profile_summarizer.h @@ -45,13 +45,13 @@ class ProfileSummarizer { // Returns a string detailing the accumulated runtime stats in the format of // summary_formatter_. std::string GetOutputString() { - return summary_formatter_->GetOutputString(stats_calculator_map_, - *delegate_stats_calculator_); + return summary_formatter_->GetOutputString( + stats_calculator_map_, *delegate_stats_calculator_, subgraph_name_map_); } std::string GetShortSummary() { - return summary_formatter_->GetShortSummary(stats_calculator_map_, - *delegate_stats_calculator_); + return summary_formatter_->GetShortSummary( + stats_calculator_map_, *delegate_stats_calculator_, subgraph_name_map_); } tensorflow::StatsCalculator* GetStatsCalculator(uint32_t subgraph_index); @@ -73,6 +73,17 @@ class ProfileSummarizer { // Summary formatter for customized output formats. std::shared_ptr summary_formatter_; + + std::map subgraph_name_map_; + + void SetSubgraphNameMap(const tflite::Interpreter& interpreter) { + subgraph_name_map_.clear(); + for (int subgraph_index = 0; subgraph_index < interpreter.subgraphs_size(); + ++subgraph_index) { + subgraph_name_map_[subgraph_index] = + interpreter.subgraph(subgraph_index)->GetName(); + } + } }; } // namespace profiling diff --git a/tensorflow/lite/profiling/profile_summary_formatter.cc b/tensorflow/lite/profiling/profile_summary_formatter.cc index 5c7bea2c279e11..31f235c999351a 100644 --- a/tensorflow/lite/profiling/profile_summary_formatter.cc +++ b/tensorflow/lite/profiling/profile_summary_formatter.cc @@ -15,10 +15,20 @@ limitations under the License. 
#include "tensorflow/lite/profiling/profile_summary_formatter.h" +#include +#include +#include #include #include +#include +#include #include #include +#include +#include + +#include "tensorflow/lite/profiling/proto/profiling_info.pb.h" +#include "tensorflow/lite/tools/logging.h" namespace tflite { namespace profiling { @@ -26,35 +36,47 @@ namespace profiling { std::string ProfileSummaryDefaultFormatter::GetOutputString( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const { + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const { return GenerateReport("profile", /*include_output_string*/ true, - stats_calculator_map, delegate_stats_calculator); + stats_calculator_map, delegate_stats_calculator, + subgraph_name_map); } std::string ProfileSummaryDefaultFormatter::GetShortSummary( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const { + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const { return GenerateReport("summary", /*include_output_string*/ false, - stats_calculator_map, delegate_stats_calculator); + stats_calculator_map, delegate_stats_calculator, + subgraph_name_map); } std::string ProfileSummaryDefaultFormatter::GenerateReport( const std::string& tag, bool include_output_string, const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const { + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const { std::stringstream stream; bool has_non_primary_graph = (stats_calculator_map.size() - stats_calculator_map.count(0)) > 0; for (const auto& stats_calc : stats_calculator_map) { auto subgraph_index = stats_calc.first; auto subgraph_stats = stats_calc.second.get(); + std::string subgraph_name = ""; + if (subgraph_name_map.find(subgraph_index) != subgraph_name_map.end()) { + subgraph_name = subgraph_name_map.at(subgraph_index); + } + if (has_non_primary_graph) { if (subgraph_index == 0) { - stream << "Primary graph " << tag << ":" << std::endl; + stream << "Primary graph (name: " << subgraph_name << ") " << tag << ":" + << std::endl; } else { - stream << "Subgraph (index: " << subgraph_index << ") " << tag << ":" + stream << "Subgraph (index: " << subgraph_index + << ", name: " << subgraph_name << ") " << tag << ":" << std::endl; } } @@ -62,7 +84,8 @@ std::string ProfileSummaryDefaultFormatter::GenerateReport( stream << subgraph_stats->GetOutputString(); } if (subgraph_index != 0) { - stream << "Subgraph (index: " << subgraph_index << ") "; + stream << "Subgraph (index: " << subgraph_index + << ", name: " << subgraph_name << ") "; } stream << subgraph_stats->GetShortSummary() << std::endl; } @@ -78,6 +101,25 @@ std::string ProfileSummaryDefaultFormatter::GenerateReport( return stream.str(); } +void ProfileSummaryDefaultFormatter::HandleOutput( + const std::string& init_output, const std::string& run_output, + std::string output_file_path) const { + std::ofstream output_file(output_file_path); + std::ostream* output_stream = nullptr; + if (output_file.good()) { + output_stream = &output_file; + } + if (!init_output.empty()) { + WriteOutput("Profiling Info for Benchmark Initialization:", init_output, + output_stream == nullptr ? 
&TFLITE_LOG(INFO) : output_stream); + } + if (!run_output.empty()) { + WriteOutput( + "Operator-wise Profiling Info for Regular Benchmark Runs:", run_output, + output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream); + } +} + tensorflow::StatSummarizerOptions ProfileSummaryDefaultFormatter::GetStatSummarizerOptions() const { auto options = tensorflow::StatSummarizerOptions(); @@ -95,5 +137,172 @@ ProfileSummaryCSVFormatter::GetStatSummarizerOptions() const { return options; } +std::vector +ProfileSummaryProtoFormatter::GetDetailsSortedByRunOrder( + const tensorflow::StatsCalculator* stats_calculator) const { + std::vector details; + std::map unsorted_details = + stats_calculator->GetDetails(); + + std::priority_queue< + std::pair> + sorted_list; + const int num_nodes = unsorted_details.size(); + for (const auto& det : unsorted_details) { + const tensorflow::StatsCalculator::Detail* detail = &(det.second); + std::stringstream stream_for_sort; + stream_for_sort << std::setw(20) << std::right << std::setprecision(10) + << std::fixed; + stream_for_sort << num_nodes - detail->run_order; + sorted_list.emplace(stream_for_sort.str(), detail); + } + + while (!sorted_list.empty()) { + auto entry = sorted_list.top(); + sorted_list.pop(); + details.push_back(*entry.second); + } + return details; +} + +void ProfileSummaryProtoFormatter::GenerateOpProfileDataFromDetail( + const tensorflow::StatsCalculator::Detail* detail, + const tensorflow::StatsCalculator* stats_calculator, + OpProfileData* const op_profile_data) const { + if (detail == nullptr) { + return; + } + + op_profile_data->set_node_type(detail->type); + OpProfilingStat* inference_stat = + op_profile_data->mutable_inference_microseconds(); + inference_stat->set_first(detail->elapsed_time.first()); + inference_stat->set_last(detail->elapsed_time.newest()); + inference_stat->set_avg(detail->elapsed_time.avg()); + inference_stat->set_stddev(detail->elapsed_time.std_deviation()); + inference_stat->set_variance(detail->elapsed_time.variance()); + inference_stat->set_min(detail->elapsed_time.min()); + inference_stat->set_max(detail->elapsed_time.max()); + inference_stat->set_sum(detail->elapsed_time.sum()); + inference_stat->set_count(detail->elapsed_time.count()); + + OpProfilingStat* memory_stat = op_profile_data->mutable_mem_kb(); + memory_stat->set_first(detail->mem_used.first() / 1000.0); + memory_stat->set_last(detail->mem_used.newest() / 1000.0); + memory_stat->set_avg(detail->mem_used.avg() / 1000.0); + memory_stat->set_stddev(detail->mem_used.std_deviation() / 1000.0); + memory_stat->set_variance(detail->mem_used.variance() / 1000000.0); + memory_stat->set_min(detail->mem_used.min() / 1000.0); + memory_stat->set_max(detail->mem_used.max() / 1000.0); + memory_stat->set_sum(detail->mem_used.sum() / 1000.0); + memory_stat->set_count(detail->mem_used.count()); + + op_profile_data->set_times_called(detail->times_called / + stats_calculator->num_runs()); + op_profile_data->set_name(detail->name); + op_profile_data->set_run_order(detail->run_order); +} + +void ProfileSummaryProtoFormatter::GenerateSubGraphProfilingData( + const tensorflow::StatsCalculator* stats_calculator, int subgraph_index, + const std::map& subgraph_name_map, + SubGraphProfilingData* const sub_graph_profiling_data) const { + sub_graph_profiling_data->set_subgraph_index(subgraph_index); + + std::string subgraph_name = ""; + if (subgraph_name_map.find(subgraph_index) != subgraph_name_map.end()) { + subgraph_name = subgraph_name_map.at(subgraph_index); + } + 
sub_graph_profiling_data->set_subgraph_name(subgraph_name); + + for (tensorflow::StatsCalculator::Detail& detail : + GetDetailsSortedByRunOrder(stats_calculator)) { + OpProfileData* const op_profile_data = + sub_graph_profiling_data->add_per_op_profiles(); + GenerateOpProfileDataFromDetail(&detail, stats_calculator, op_profile_data); + } +} + +void ProfileSummaryProtoFormatter::GenerateDelegateProfilingData( + const tensorflow::StatsCalculator* stats_calculator, + DelegateProfilingData* const delegate_profiling_data) const { + for (const tensorflow::StatsCalculator::Detail& detail : + GetDetailsSortedByRunOrder(stats_calculator)) { + OpProfileData* const op_profile_data = + delegate_profiling_data->add_per_op_profiles(); + GenerateOpProfileDataFromDetail(&detail, stats_calculator, op_profile_data); + } +} + +std::string ProfileSummaryProtoFormatter::GetShortSummary( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const { + TFLITE_LOG(ERROR) << "GetShortSummary is not supported for proto formatter."; + return ""; +} + +std::string ProfileSummaryProtoFormatter::GetOutputString( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const { + ModelProfilingData model_profiling_data; + for (const auto& stats_calc : stats_calculator_map) { + auto subgraph_index = stats_calc.first; + tensorflow::StatsCalculator* subgraph_stats = stats_calc.second.get(); + SubGraphProfilingData* const sub_graph_profiling_data = + model_profiling_data.add_subgraph_profiles(); + GenerateSubGraphProfilingData(subgraph_stats, subgraph_index, + subgraph_name_map, sub_graph_profiling_data); + } + + if (delegate_stats_calculator.num_runs() > 0) { + DelegateProfilingData* const delegate_profiling_data = + model_profiling_data.add_delegate_profiles(); + GenerateDelegateProfilingData(&delegate_stats_calculator, + delegate_profiling_data); + } + + return model_profiling_data.SerializeAsString(); +} + +tensorflow::StatSummarizerOptions +ProfileSummaryProtoFormatter::GetStatSummarizerOptions() const { + auto options = tensorflow::StatSummarizerOptions(); + // Summary will be manually handled per subgraph in order to keep + // compatibility.
+ options.show_summary = false; + options.show_memory = false; + return options; +} + +void ProfileSummaryProtoFormatter::HandleOutput( + const std::string& init_output, const std::string& run_output, + std::string output_file_path) const { + std::ofstream output_file(output_file_path, std::ios_base::binary); + std::ostream* output_stream = nullptr; + if (output_file.good()) { + output_stream = &output_file; + } + + BenchmarkProfilingData benchmark_profiling_data; + if (!init_output.empty()) { + benchmark_profiling_data.mutable_init_profile()->ParseFromString( + init_output); + } + if (!run_output.empty()) { + benchmark_profiling_data.mutable_runtime_profile()->ParseFromString( + run_output); + } + + if (output_stream == nullptr) { + TFLITE_LOG(INFO) << benchmark_profiling_data.DebugString(); + } else { + benchmark_profiling_data.SerializeToOstream(output_stream); + } +} + } // namespace profiling } // namespace tflite diff --git a/tensorflow/lite/profiling/profile_summary_formatter.h b/tensorflow/lite/profiling/profile_summary_formatter.h index 9c7a13530bbd93..62514eafecda93 100644 --- a/tensorflow/lite/profiling/profile_summary_formatter.h +++ b/tensorflow/lite/profiling/profile_summary_formatter.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARY_FORMATTER_H_ #define TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARY_FORMATTER_H_ +#include +#include +#include #include #include #include @@ -23,7 +26,9 @@ limitations under the License. #include #include +#include "tensorflow/core/util/stat_summarizer_options.h" #include "tensorflow/core/util/stats_calculator.h" +#include "tensorflow/lite/profiling/proto/profiling_info.pb.h" namespace tflite { namespace profiling { @@ -31,54 +36,110 @@ namespace profiling { // Formats the profile summary in a certain way. class ProfileSummaryFormatter { public: - ProfileSummaryFormatter() {} + ProfileSummaryFormatter() = default; virtual ~ProfileSummaryFormatter() {} // Returns a string detailing the accumulated runtime stats in StatsCalculator // of ProfileSummarizer. virtual std::string GetOutputString( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const = 0; + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const = 0; // Returns a string detailing the short summary of the accumulated runtime // stats in StatsCalculator of ProfileSummarizer. 
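Every formatter entry point now takes the subgraph-index-to-name map alongside the stats calculators, which is what lets reports print names next to indices. A usage sketch of the extended interface; the uint32_t key type is an assumption based on GetStatsCalculator(uint32_t) above, and the include paths are the ones these files already use.

#include <cstdint>
#include <map>
#include <memory>
#include <string>

#include "tensorflow/core/util/stats_calculator.h"
#include "tensorflow/lite/profiling/profile_summary_formatter.h"

// Render a report with human-readable subgraph names attached.
std::string RenderReport(
    const tflite::profiling::ProfileSummaryFormatter& formatter,
    const std::map<uint32_t, std::unique_ptr<tensorflow::StatsCalculator>>&
        stats_calculator_map,
    const tensorflow::StatsCalculator& delegate_stats) {
  const std::map<uint32_t, std::string> names = {{0, "Primary graph"},
                                                 {1, "Subgraph 1"}};
  return formatter.GetOutputString(stats_calculator_map, delegate_stats,
                                   names);
}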
virtual std::string GetShortSummary( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const = 0; + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const = 0; virtual tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const = 0; + virtual void HandleOutput(const std::string& init_output, + const std::string& run_output, + std::string output_file_path) const = 0; }; class ProfileSummaryDefaultFormatter : public ProfileSummaryFormatter { public: - ProfileSummaryDefaultFormatter() {} + ProfileSummaryDefaultFormatter() = default; ~ProfileSummaryDefaultFormatter() override {} std::string GetOutputString( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) - const override; + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const override; std::string GetShortSummary( const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) - const override; + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const override; tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const override; + void HandleOutput(const std::string& init_output, + const std::string& run_output, + std::string output_file_path) const override; private: std::string GenerateReport( const std::string& tag, bool include_output_string, const std::map>& stats_calculator_map, - const tensorflow::StatsCalculator& delegate_stats_calculator) const; + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const; + void WriteOutput(const std::string& header, const std::string& data, + std::ostream* stream) const { + (*stream) << header << std::endl; + (*stream) << data << std::endl; + } }; class ProfileSummaryCSVFormatter : public ProfileSummaryDefaultFormatter { public: - ProfileSummaryCSVFormatter() {} + ProfileSummaryCSVFormatter() = default; tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const override; }; +class ProfileSummaryProtoFormatter : public ProfileSummaryFormatter { + public: + std::string GetOutputString( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const override; + std::string GetShortSummary( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const override; + tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const override; + void HandleOutput(const std::string& init_output, + const std::string& run_output, + std::string output_file_path) const override; + + private: + std::string GenerateReport( + const std::string& tag, bool include_output_string, + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator, + const std::map& subgraph_name_map) const; + void GenerateSubGraphProfilingData( + const tensorflow::StatsCalculator* stats_calculator, int subgraph_index, + const std::map& subgraph_name_map, + SubGraphProfilingData* sub_graph_profiling_data) const; + + void GenerateDelegateProfilingData( + const tensorflow::StatsCalculator* stats_calculator, + DelegateProfilingData* delegate_profiling_data) const; + + void GenerateOpProfileDataFromDetail( + const tensorflow::StatsCalculator::Detail* detail, + const 
tensorflow::StatsCalculator* stats_calculator, + OpProfileData* op_profile_data) const; + + std::vector GetDetailsSortedByRunOrder( + const tensorflow::StatsCalculator* stats_calculator) const; +}; + } // namespace profiling } // namespace tflite diff --git a/tensorflow/lite/profiling/profile_summary_formatter_test.cc b/tensorflow/lite/profiling/profile_summary_formatter_test.cc index eefd35667e3b2a..d9f26e0b729bc7 100644 --- a/tensorflow/lite/profiling/profile_summary_formatter_test.cc +++ b/tensorflow/lite/profiling/profile_summary_formatter_test.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/profiling/profile_summary_formatter.h" +#include +#include #include #include #include @@ -21,6 +23,7 @@ limitations under the License. #include #include #include "absl/strings/match.h" +#include "tensorflow/lite/profiling/proto/profiling_info.pb.h" namespace tflite { namespace profiling { @@ -46,7 +49,7 @@ TEST(SummaryWriterTest, EmptyOutputString) { ProfileSummaryDefaultFormatter writer; std::string output = writer.GetOutputString( std::map>(), - tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()), {}); EXPECT_EQ(output.size(), 0); } @@ -54,7 +57,7 @@ TEST(SummaryWriterTest, EmptyShortSummary) { ProfileSummaryDefaultFormatter writer; std::string output = writer.GetShortSummary( std::map>(), - tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()), {}); EXPECT_EQ(output.size(), 0); } @@ -66,7 +69,7 @@ TEST(SummaryWriterTest, SingleSubgraphOutputString) { writer.GetStatSummarizerOptions()); std::string output = writer.GetOutputString( stats_calculator_map, - tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()), {}); ASSERT_TRUE(absl::StrContains(output, "Run Order")); ASSERT_TRUE(absl::StrContains(output, "Top by Computation Time")); ASSERT_TRUE(!absl::StrContains(output, "Top by Memory Use")); @@ -85,7 +88,8 @@ TEST(SummaryWriterTest, SingleSubgraphShortSummary) { writer.GetStatSummarizerOptions()); std::string output = writer.GetShortSummary( stats_calculator_map, - tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()), + {{0, "Primary graph"}}); ASSERT_TRUE(!absl::StrContains(output, "Run Order")); ASSERT_TRUE(!absl::StrContains(output, "Top by Computation Time")); ASSERT_TRUE(!absl::StrContains(output, "Top by Memory Use")); @@ -106,12 +110,251 @@ TEST(SummaryWriterTest, MultiSubgraphOutputString) { writer.GetStatSummarizerOptions()); std::string output = writer.GetOutputString( stats_calculator_map, - tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()), + {{0, "Primary graph"}, {1, "Subgraph 1"}}); ASSERT_TRUE(absl::StrContains(output, "Primary graph")); ASSERT_TRUE(absl::StrContains(output, "Subgraph")); ASSERT_TRUE(!absl::StrContains(output, "Delegate internal")); } +TEST(SummaryWriterTest, MultiSubgraphOutputStringForProto) { + ProfileSummaryProtoFormatter writer; + std::map> + stats_calculator_map; + stats_calculator_map[0] = std::make_unique( + writer.GetStatSummarizerOptions()); + std::string kernel_name_1 = "Kernel 1"; + std::string kernel_name_2 = "Kernel 2"; + std::string kernel_name_3 = "Kernel 3"; 
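The remainder of this test leans on a plain protobuf round trip: GetOutputString returns ModelProfilingData::SerializeAsString(), and the test parses those bytes back before comparing messages with EqualsProto. A condensed sketch of that round trip; the tflite::profiling namespace is assumed from the generated profiling_info.pb.h header included above.

#include <string>

#include "tensorflow/lite/profiling/proto/profiling_info.pb.h"

// Serialize-then-parse round trip, as exercised below: the formatter emits
// wire-format bytes, and the test reconstructs the message from them.
std::string RoundTripSubgraphName() {
  tflite::profiling::ModelProfilingData data;
  data.add_subgraph_profiles()->set_subgraph_name("Primary graph");
  const std::string wire = data.SerializeAsString();

  tflite::profiling::ModelProfilingData parsed;
  parsed.ParseFromString(wire);
  return parsed.subgraph_profiles(0).subgraph_name();  // "Primary graph"
}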
+
+ std::string op_name_1 = "Convolution";
+ std::string op_name_2 = "Reshape";
+ std::string op_name_3 = "Convolution";
+ stats_calculator_map[0]->AddNodeStats(kernel_name_1, op_name_1, 1, 10, 10000);
+ stats_calculator_map[0]->AddNodeStats(kernel_name_1, op_name_1, 1, 20, 20000);
+ stats_calculator_map[0]->AddNodeStats(kernel_name_2, op_name_2, 2, 15, 10000);
+ stats_calculator_map[0]->UpdateRunTotalUs(25);
+ stats_calculator_map[1] = std::make_unique<tensorflow::StatsCalculator>(
+ writer.GetStatSummarizerOptions());
+ stats_calculator_map[1]->AddNodeStats(kernel_name_3, op_name_3, 3, 10, 10000);
+ stats_calculator_map[1]->UpdateRunTotalUs(10);
+
+ std::string output = writer.GetOutputString(
+ stats_calculator_map,
+ tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()),
+ {{0, "Primary graph"}, {1, "Subgraph 1"}});
+ ModelProfilingData model_profiling_data;
+ model_profiling_data.ParseFromString(output);
+ ASSERT_TRUE(absl::StrContains(output, "Primary graph"));
+ ASSERT_TRUE(absl::StrContains(output, "Subgraph"));
+ ASSERT_TRUE(!absl::StrContains(output, "Delegate internal"));
+ ASSERT_EQ(model_profiling_data.subgraph_profiles().size(), 2);
+ ASSERT_EQ(model_profiling_data.subgraph_profiles(0).subgraph_name(),
+ "Primary graph");
+ ASSERT_EQ(model_profiling_data.subgraph_profiles(0).per_op_profiles().size(),
+ 2);
+
+ OpProfileData op_profile_data_1;
+ op_profile_data_1.set_node_type(op_name_1);
+ OpProfilingStat* inference_microseconds_stat_1 =
+ op_profile_data_1.mutable_inference_microseconds();
+ inference_microseconds_stat_1->set_first(10);
+ inference_microseconds_stat_1->set_last(20);
+ inference_microseconds_stat_1->set_max(20);
+ inference_microseconds_stat_1->set_min(10);
+ inference_microseconds_stat_1->set_avg(15);
+ inference_microseconds_stat_1->set_stddev(5);
+ inference_microseconds_stat_1->set_variance(25);
+ inference_microseconds_stat_1->set_sum(30);
+ inference_microseconds_stat_1->set_count(2);
+ OpProfilingStat* memory_stat_1 = op_profile_data_1.mutable_mem_kb();
+ memory_stat_1->set_first(10);
+ memory_stat_1->set_last(20);
+ memory_stat_1->set_max(20);
+ memory_stat_1->set_min(10);
+ memory_stat_1->set_avg(15);
+ memory_stat_1->set_stddev(5);
+ memory_stat_1->set_variance(25);
+ memory_stat_1->set_sum(30);
+ memory_stat_1->set_count(2);
+ op_profile_data_1.set_name(kernel_name_1);
+ op_profile_data_1.set_run_order(1);
+ op_profile_data_1.set_times_called(2);
+ EXPECT_THAT(model_profiling_data.subgraph_profiles(0).per_op_profiles(0),
+ testing::EqualsProto(op_profile_data_1));
+
+ OpProfileData op_profile_data_2;
+ op_profile_data_2.set_node_type(op_name_2);
+ OpProfilingStat* inference_microseconds_stat_2 =
+ op_profile_data_2.mutable_inference_microseconds();
+ inference_microseconds_stat_2->set_first(15);
+ inference_microseconds_stat_2->set_last(15);
+ inference_microseconds_stat_2->set_max(15);
+ inference_microseconds_stat_2->set_min(15);
+ inference_microseconds_stat_2->set_avg(15);
+ inference_microseconds_stat_2->set_stddev(0);
+ inference_microseconds_stat_2->set_variance(0);
+ inference_microseconds_stat_2->set_sum(15);
+ inference_microseconds_stat_2->set_count(1);
+ OpProfilingStat* memory_stat_2 = op_profile_data_2.mutable_mem_kb();
+ memory_stat_2->set_first(10);
+ memory_stat_2->set_last(10);
+ memory_stat_2->set_max(10);
+ memory_stat_2->set_min(10);
+ memory_stat_2->set_avg(10);
+ memory_stat_2->set_stddev(0);
+ memory_stat_2->set_variance(0);
+ memory_stat_2->set_sum(10);
+ memory_stat_2->set_count(1);
+ op_profile_data_2.set_times_called(1);
+
op_profile_data_2.set_name(kernel_name_2); + op_profile_data_2.set_run_order(2); + + EXPECT_THAT(model_profiling_data.subgraph_profiles(0).per_op_profiles(1), + testing::EqualsProto(op_profile_data_2)); + + ASSERT_EQ(model_profiling_data.subgraph_profiles(1).subgraph_name(), + "Subgraph 1"); + ASSERT_EQ(model_profiling_data.subgraph_profiles(1).per_op_profiles().size(), + 1); + + OpProfileData op_profile_data_3; + op_profile_data_3.set_node_type(op_name_3); + OpProfilingStat* inference_microseconds_stat_3 = + op_profile_data_3.mutable_inference_microseconds(); + inference_microseconds_stat_3->set_first(10); + inference_microseconds_stat_3->set_last(10); + inference_microseconds_stat_3->set_max(10); + inference_microseconds_stat_3->set_min(10); + inference_microseconds_stat_3->set_avg(10); + inference_microseconds_stat_3->set_stddev(0); + inference_microseconds_stat_3->set_variance(0); + inference_microseconds_stat_3->set_sum(10); + inference_microseconds_stat_3->set_count(1); + OpProfilingStat* memory_stat_3 = op_profile_data_3.mutable_mem_kb(); + memory_stat_3->set_first(10); + memory_stat_3->set_last(10); + memory_stat_3->set_max(10); + memory_stat_3->set_min(10); + memory_stat_3->set_avg(10); + memory_stat_3->set_stddev(0); + memory_stat_3->set_variance(0); + memory_stat_3->set_sum(10); + memory_stat_3->set_count(1); + op_profile_data_3.set_times_called(1); + op_profile_data_3.set_name(kernel_name_3); + op_profile_data_3.set_run_order(3); + EXPECT_THAT(model_profiling_data.subgraph_profiles(1).per_op_profiles(0), + testing::EqualsProto(op_profile_data_3)); +} + +TEST(SummaryWriterTest, MultiSubgraphHandleOutputForProto) { + ProfileSummaryProtoFormatter writer; + + ModelProfilingData model_profiling_data_run; + SubGraphProfilingData* subgraph_profiling_data = + model_profiling_data_run.add_subgraph_profiles(); + subgraph_profiling_data->set_subgraph_name("Primary graph"); + OpProfileData* op_profile_data_1 = + subgraph_profiling_data->add_per_op_profiles(); + op_profile_data_1->set_node_type("Convolution"); + OpProfilingStat* inference_stat_1 = + op_profile_data_1->mutable_inference_microseconds(); + inference_stat_1->set_first(10); + inference_stat_1->set_avg(10); + OpProfilingStat* mem_stat_1 = op_profile_data_1->mutable_mem_kb(); + mem_stat_1->set_first(10); + mem_stat_1->set_avg(10); + op_profile_data_1->set_times_called(1); + op_profile_data_1->set_name("Kernel 1"); + op_profile_data_1->set_run_order(1); + OpProfileData* op_profile_data_2 = + subgraph_profiling_data->add_per_op_profiles(); + op_profile_data_2->set_node_type("Reshape"); + OpProfilingStat* inference_stat_2 = + op_profile_data_2->mutable_inference_microseconds(); + inference_stat_2->set_first(15); + inference_stat_2->set_avg(15); + OpProfilingStat* mem_stat_2 = op_profile_data_2->mutable_mem_kb(); + mem_stat_2->set_first(10); + mem_stat_2->set_avg(10); + op_profile_data_2->set_times_called(1); + op_profile_data_2->set_name("Kernel 2"); + op_profile_data_2->set_run_order(2); + SubGraphProfilingData* subgraph_profiling_data_1 = + model_profiling_data_run.add_subgraph_profiles(); + subgraph_profiling_data_1->set_subgraph_name("Subgraph 1"); + OpProfileData* op_profile_data_3 = + subgraph_profiling_data_1->add_per_op_profiles(); + op_profile_data_3->set_node_type("Convolution"); + OpProfilingStat* inference_stat_3 = + op_profile_data_3->mutable_inference_microseconds(); + inference_stat_3->set_first(10); + inference_stat_3->set_avg(10); + OpProfilingStat* mem_stat_3 = op_profile_data_3->mutable_mem_kb(); + 
mem_stat_3->set_first(10);
+ mem_stat_3->set_avg(10);
+ op_profile_data_3->set_times_called(1);
+ op_profile_data_3->set_name("Kernel 3");
+ op_profile_data_3->set_run_order(3);
+ DelegateProfilingData* delegate_profiling_data =
+ model_profiling_data_run.add_delegate_profiles();
+ OpProfileData* op_profile_data_4 =
+ delegate_profiling_data->add_per_op_profiles();
+ op_profile_data_4->set_node_type("Convolution");
+ OpProfilingStat* inference_stat_4 =
+ op_profile_data_4->mutable_inference_microseconds();
+ inference_stat_4->set_first(10);
+ inference_stat_4->set_avg(10);
+ OpProfilingStat* mem_stat_4 = op_profile_data_4->mutable_mem_kb();
+ mem_stat_4->set_first(10);
+ mem_stat_4->set_avg(10);
+ op_profile_data_4->set_times_called(1);
+ op_profile_data_4->set_name("Kernel 4");
+ op_profile_data_4->set_run_order(4);
+
+ ModelProfilingData model_profiling_data_init;
+ SubGraphProfilingData* subgraph_profiling_data_init =
+ model_profiling_data_init.add_subgraph_profiles();
+ subgraph_profiling_data_init->set_subgraph_name("Primary graph");
+ OpProfileData* op_profile_data_init_1 =
+ subgraph_profiling_data_init->add_per_op_profiles();
+ op_profile_data_init_1->set_node_type("Convolution");
+ OpProfilingStat* inference_stat_init_1 =
+ op_profile_data_init_1->mutable_inference_microseconds();
+ inference_stat_init_1->set_first(10);
+ inference_stat_init_1->set_avg(10);
+ op_profile_data_init_1->set_times_called(1);
+ OpProfilingStat* mem_stat_init_1 = op_profile_data_init_1->mutable_mem_kb();
+ mem_stat_init_1->set_first(10);
+ mem_stat_init_1->set_avg(10);
+ op_profile_data_init_1->set_name("ModifyGraphWithDelegate");
+ op_profile_data_init_1->set_run_order(1);
+
+#ifdef __ANDROID__
+ std::string file_name = "/data/local/tmp/test_file.proto";
+#else
+ std::string file_name = "/tmp/test_file.proto";
+#endif
+
+ writer.HandleOutput(model_profiling_data_init.SerializeAsString(),
+ model_profiling_data_run.SerializeAsString(), file_name);
+
+ std::ifstream file(file_name, std::ios::binary);
+
+ ASSERT_TRUE(file.good());
+
+ BenchmarkProfilingData benchmark_profiling_data;
+ benchmark_profiling_data.ParseFromIstream(&file);
+ file.close();
+
+ ASSERT_TRUE(benchmark_profiling_data.model_name().empty());
+ EXPECT_THAT(benchmark_profiling_data.init_profile(),
+ testing::EqualsProto(model_profiling_data_init));
+ EXPECT_THAT(benchmark_profiling_data.runtime_profile(),
+ testing::EqualsProto(model_profiling_data_run));
+}
+
TEST(SummaryWriterTest, MultiSubgraphShortSummary) {
ProfileSummaryDefaultFormatter writer;
std::map<uint32_t, std::unique_ptr<tensorflow::StatsCalculator>>
@@ -122,7 +365,8 @@ TEST(SummaryWriterTest, MultiSubgraphShortSummary) {
writer.GetStatSummarizerOptions());
std::string output = writer.GetShortSummary(
stats_calculator_map,
- tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()));
+ tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()),
+ {{0, "Primary graph"}, {1, "Subgraph 1"}});
ASSERT_TRUE(absl::StrContains(output, "Primary graph"));
ASSERT_TRUE(absl::StrContains(output, "Subgraph"));
ASSERT_TRUE(!absl::StrContains(output, "Delegate internal"));
@@ -135,7 +379,7 @@ TEST(SummaryWriterTest, DelegationOutputString) {
delegate_stats_calculator.UpdateRunTotalUs(1);
std::string output = writer.GetOutputString(
std::map<uint32_t, std::unique_ptr<tensorflow::StatsCalculator>>(),
- delegate_stats_calculator);
+ delegate_stats_calculator, {});
ASSERT_TRUE(!absl::StrContains(output, "Primary graph"));
ASSERT_TRUE(!absl::StrContains(output, "Subgraph"));
ASSERT_TRUE(absl::StrContains(output, "Delegate internal"));
@@ -148,7 +392,7 @@ TEST(SummaryWriterTest,
DelegationShortSummary) {
delegate_stats_calculator.UpdateRunTotalUs(1);
std::string output = writer.GetShortSummary(
std::map<uint32_t, std::unique_ptr<tensorflow::StatsCalculator>>(),
- delegate_stats_calculator);
+ delegate_stats_calculator, {});
ASSERT_TRUE(!absl::StrContains(output, "Primary graph"));
ASSERT_TRUE(!absl::StrContains(output, "Subgraph"));
ASSERT_TRUE(absl::StrContains(output, "Delegate internal"));
diff --git a/tensorflow/lite/profiling/proto/BUILD b/tensorflow/lite/profiling/proto/BUILD
new file mode 100644
index 00000000000000..5e3160b318bf8e
--- /dev/null
+++ b/tensorflow/lite/profiling/proto/BUILD
@@ -0,0 +1,41 @@
+# Placeholder: load py_proto_library
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load(
+ "//tensorflow/core/platform:build_config.bzl",
+ "tf_proto_library",
+)
+# copybara:uncomment load("//tools/build_defs/proto/cpp:cc_proto_library.bzl", "cc_proto_library")
+
+package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+ default_visibility = ["//visibility:public"],
+ licenses = ["notice"],
+)
+
+proto_library(
+ name = "profiling_info_proto",
+ srcs = ["profiling_info.proto"],
+ compatible_with = get_compatible_with_portable(),
+ visibility = ["//visibility:public"],
+)
+
+cc_proto_library(
+ name = "profiling_info_cc_proto",
+ compatible_with = get_compatible_with_portable(),
+ deps = [":profiling_info_proto"],
+)
+
+tf_proto_library(
+ name = "profiling_info", # bzl adds _py
+ srcs = ["profiling_info.proto"],
+ visibility = ["//visibility:public"],
+)
+
+# copybara:uncomment_begin(google-only)
+# py_proto_library(
+# name = "profiling_info_py_pb2",
+# api_version = 2,
+# compatible_with = get_compatible_with_portable(),
+# deps = [":profiling_info_proto"],
+# )
+# copybara:uncomment_end
diff --git a/tensorflow/lite/profiling/proto/CMakeLists.txt b/tensorflow/lite/profiling/proto/CMakeLists.txt
new file mode 100644
index 00000000000000..a0955470db7d6f
--- /dev/null
+++ b/tensorflow/lite/profiling/proto/CMakeLists.txt
@@ -0,0 +1,41 @@
+#
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+find_package(Protobuf REQUIRED)
+
+add_library(profiling_info_proto profiling_info.proto)
+
+list(APPEND proto_generated_files ${CMAKE_CURRENT_BINARY_DIR}/profiling_info.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/profiling_info.pb.h)
+
+# Generate profiling_info.pb.cc and profiling_info.pb.h from
+# profiling_info.proto using protoc. Once the protobuf package version is
+# upgraded, we can use protobuf_generate_cpp/protobuf_generate here directly.
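# (For reference, the generated rule below amounts to a plain protoc run of
# the form, with illustrative paths:
#   protoc --cpp_out=<binary dir> --proto_path=<source dir> profiling_info.proto)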
+add_custom_command(
+ OUTPUT ${proto_generated_files}
+ COMMAND ${Protobuf_PROTOC_EXECUTABLE}
+ ARGS --cpp_out=${CMAKE_CURRENT_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto
+ DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto
+)
+
+set_source_files_properties(${proto_generated_files} PROPERTIES GENERATED TRUE)
+target_sources(profiling_info_proto PRIVATE ${proto_generated_files})
+target_link_libraries(profiling_info_proto protobuf::libprotobuf)
+target_include_directories(profiling_info_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+# Move all generated proto files to the TFLITE_GENERATED_HEADERS_DIR
+add_custom_command(
+ TARGET profiling_info_proto POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_directory
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${TFLITE_GENERATED_HEADERS_DIR}/profiling/proto)
\ No newline at end of file
diff --git a/tensorflow/lite/profiling/proto/profiling_info.proto b/tensorflow/lite/profiling/proto/profiling_info.proto
new file mode 100644
index 00000000000000..8116524405dc11
--- /dev/null
+++ b/tensorflow/lite/profiling/proto/profiling_info.proto
@@ -0,0 +1,63 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +syntax = "proto2"; + +package tflite.profiling; + +option java_multiple_files = true; + +message BenchmarkProfilingData { + optional string model_name = 1; + optional ModelProfilingData init_profile = 2; + optional ModelProfilingData runtime_profile = 3; +} + +message ModelProfilingData { + repeated SubGraphProfilingData subgraph_profiles = 1; + repeated DelegateProfilingData delegate_profiles = 2; +} + +message SubGraphProfilingData { + optional string subgraph_name = 1; + optional int32 subgraph_index = 2; + repeated OpProfileData per_op_profiles = 3; +} + +message DelegateProfilingData { + optional string delegate_name = 1; + repeated OpProfileData per_op_profiles = 2; +} + +message OpProfilingStat { + optional int64 first = 1; + optional int64 last = 2; + optional int64 avg = 3; + optional float stddev = 4; + optional float variance = 5; + optional int64 min = 6; + optional int64 max = 7; + optional int64 sum = 8; + optional int64 count = 9; +} + +message OpProfileData { + optional string node_type = 1; + optional OpProfilingStat inference_microseconds = 2; + optional OpProfilingStat mem_kb = 3; + optional int64 times_called = 4; + optional string name = 5; + optional int64 run_order = 6; +} diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 4e85652310481e..8ff6d3939d996b 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -197,6 +197,7 @@ py_strict_library( "//tensorflow/compiler/mlir/quantization/tensorflow/python:representative_dataset", "//tensorflow/core:protos_all_py", "//tensorflow/lite/experimental/microfrontend:audio_microfrontend_py", + "//tensorflow/lite/profiling/proto:profiling_info_py", "//tensorflow/lite/python/metrics", "//tensorflow/lite/python/optimize:calibrator", "//tensorflow/lite/tools:flatbuffer_utils", diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD index 6155575b4048b5..7bf0f18d68fc24 100644 --- a/tensorflow/lite/schema/BUILD +++ b/tensorflow/lite/schema/BUILD @@ -144,28 +144,6 @@ flatbuffer_cc_library( out_prefix = "reflection/", ) -# Schema test to make sure we don't introduce backward incompatible changes -# to schemas. -cc_test( - name = "flatbuffer_compatibility_test", - size = "small", - srcs = ["flatbuffer_compatibility_test.cc"], - data = [ - "schema.fbs", - "schema_v3b.fbs", - ], - tags = [ - "no_oss", - "tflite_not_portable_android", - "tflite_not_portable_ios", - ], - deps = [ - "//tensorflow/core/platform", - "@com_google_googletest//:gtest_main", - "@flatbuffers//:flatc_library", - ], -) - cc_library( name = "schema_utils", hdrs = ["schema_utils.h"], diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index 8c60f8ad012bd8..b08a2d913b6ec7 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -257,6 +257,7 @@ cc_library_with_tflite( cc_library( name = "logging", hdrs = ["logging.h"], + compatible_with = get_compatible_with_portable(), copts = tflite_copts_warnings(), ) diff --git a/tensorflow/lite/tools/benchmark/CMakeLists.txt b/tensorflow/lite/tools/benchmark/CMakeLists.txt index fc2a1be282f985..56794382ff45a8 100644 --- a/tensorflow/lite/tools/benchmark/CMakeLists.txt +++ b/tensorflow/lite/tools/benchmark/CMakeLists.txt @@ -45,6 +45,11 @@ list(APPEND TFLITE_BENCHMARK_LIBS tensorflow-lite ) +list(APPEND TFLITE_BENCHMARK_LIBS + profiling_info_proto + protobuf::libprotobuf +) + # TODO(b/171007016): Enable performance options on Windows. 
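An illustrative consumer sketch: with op profiling enabled and the proto output mode selected (the `op_profiling_output_mode` and `op_profiling_output_file` options documented below), the output file holds a single serialized `tflite.profiling.BenchmarkProfilingData` message defined above, so it can be read back with ordinary protobuf calls. A minimal sketch, assuming only the generated header and a hypothetical file path:

#include <fstream>
#include <ios>
#include <iostream>

#include "tensorflow/lite/profiling/proto/profiling_info.pb.h"

int main() {
  // Hypothetical path; use whatever was passed to --op_profiling_output_file.
  std::ifstream file("/tmp/profile.proto", std::ios::binary);
  tflite::profiling::BenchmarkProfilingData data;
  if (!file.good() || !data.ParseFromIstream(&file)) return 1;
  // Walk the per-subgraph, per-op timings from the regular benchmark runs.
  for (const auto& subgraph : data.runtime_profile().subgraph_profiles()) {
    for (const auto& op : subgraph.per_op_profiles()) {
      std::cout << subgraph.subgraph_name() << ": " << op.name() << " ("
                << op.node_type() << ") avg "
                << op.inference_microseconds().avg() << " us\n";
    }
  }
  return 0;
}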
if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
list(APPEND TFLITE_BENCHMARK_SRCS
@@ -92,6 +97,10 @@ target_compile_options(benchmark_model
PRIVATE
${TFLITE_BENCHMARK_CC_OPTIONS}
)
+target_include_directories(benchmark_model
+ PUBLIC
+ ${CMAKE_BINARY_DIR}
+)
target_link_libraries(benchmark_model
${TFLITE_BENCHMARK_LIBS}
)
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index f25da51705d6b8..e92d841b9c6a87 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -67,7 +67,24 @@
and the following optional parameters:
thus it is preferred to set `max_profiling_buffer_entries` to a
large-enough value.
+* `op_profiling_output_mode`: `str` (default="stdout") \
+ The output mode for the profiling information generated. Requires
+ `enable_op_profiling` to be `true`. Takes one of the following 3 values:
+ - `stdout` : Print profiling information to STDOUT.
+ - `csv` : Print the profiling information in a CSV format.
+ - `proto` : Print the profiling information in a proto format as specified
+ in `tensorflow/lite/profiling/proto/profiling_info.proto`.
+* `op_profiling_output_file`: `str` (default="") \
+ File path to export profile data to. The results are printed to
+ `stdout` if option is not set. Requires `enable_op_profiling` to be `true`
+ and the path to include the name of the output file; otherwise results are
+ printed to `stdout`.
+
* `profiling_output_csv_file`: `str` (default="") \
+
+ WARNING: Deprecated, prefer using `op_profiling_output_mode` and
+ `op_profiling_output_file` instead.
+
File path to export profile data to as CSV. The results are printed to
`stdout` if option is not set. Requires `enable_op_profiling` to be `true`
and the path to include the name of the output CSV; otherwise results are
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index d775122fe9c1fc..8fb5b23b7860d9 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -78,6 +78,15 @@ constexpr bool kOpProfilingEnabledDefault = true;
constexpr bool kOpProfilingEnabledDefault = false;
#endif
+// Op profiling output modes.
+constexpr char kOpProfilingOutputModeStdout[] = "stdout";
+constexpr char kOpProfilingOutputModeCsv[] = "csv";
+constexpr char kOpProfilingOutputModeProto[] = "proto";
+
+const char* kOpProfilingOutputModes[] = {kOpProfilingOutputModeStdout,
+ kOpProfilingOutputModeCsv,
+ kOpProfilingOutputModeProto};
+
// Dumps ruy profiling events if the ruy profiler is enabled.
class RuyProfileListener : public BenchmarkListener {
public:
@@ -310,10 +319,14 @@ TfLiteStatus PopulateInputLayerInfo(
}
std::shared_ptr<profiling::ProfileSummaryFormatter>
-CreateProfileSummaryFormatter(bool format_as_csv) {
- return format_as_csv
- ? std::make_shared<profiling::ProfileSummaryCSVFormatter>()
- : std::make_shared<profiling::ProfileSummaryDefaultFormatter>();
+CreateProfileSummaryFormatter(const std::string& output_mode) {
+ if (output_mode == kOpProfilingOutputModeCsv) {
+ return std::make_shared<profiling::ProfileSummaryCSVFormatter>();
+ } else if (output_mode == kOpProfilingOutputModeProto) {
+ return std::make_shared<profiling::ProfileSummaryProtoFormatter>();
+ } else {
+ return std::make_shared<profiling::ProfileSummaryDefaultFormatter>();
+ }
}
} // namespace
@@ -479,6 +492,11 @@ BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
default_params.AddParam(
"enable_op_profiling",
BenchmarkParam::Create<bool>(kOpProfilingEnabledDefault));
+ default_params.AddParam(
+ "op_profiling_output_mode",
+ BenchmarkParam::Create<std::string>(kOpProfilingOutputModeStdout));
+ default_params.AddParam("op_profiling_output_file",
+ BenchmarkParam::Create<std::string>(""));
default_params.AddParam("max_profiling_buffer_entries",
BenchmarkParam::Create<int32_t>(1024));
default_params.AddParam("allow_dynamic_profiling_buffer_increase",
@@ -565,14 +583,21 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
CreateFlag<bool>("require_full_delegation", &params_,
"require delegate to run the entire graph"),
CreateFlag<bool>("enable_op_profiling", &params_, "enable op profiling"),
+ CreateFlag<std::string>(
+ "op_profiling_output_mode", &params_,
+ "Output mode for op profiling results. Supported values are: "
+ "'stdout', 'csv' and 'proto'."),
+ CreateFlag<std::string>("op_profiling_output_file", &params_,
+ "Output file for op profiling results."),
CreateFlag<int32_t>("max_profiling_buffer_entries", &params_,
"max initial profiling buffer entries"),
CreateFlag<bool>("allow_dynamic_profiling_buffer_increase", &params_,
"allow dynamic increase on profiling buffer entries"),
- CreateFlag<std::string>(
- "profiling_output_csv_file", &params_,
- "File path to export profile data as CSV, if not set "
- "prints to stdout."),
+ CreateFlag<std::string>("profiling_output_csv_file", &params_,
+ "[DEPRECATED: Use op_profiling_output_file and "
+ "op_profiling_output_mode instead] File path to "
+ "export profile data as CSV, if not set "
+ "prints to stdout."),
CreateFlag<bool>(
"print_preinvoke_state", &params_,
"print out the interpreter internals just before calling Invoke. The "
@@ -650,6 +675,10 @@ void BenchmarkTfLiteModel::LogParams() {
"Require full delegation", verbose);
LOG_BENCHMARK_PARAM(bool, "enable_op_profiling", "Enable op profiling",
verbose);
+ LOG_BENCHMARK_PARAM(std::string, "op_profiling_output_mode",
+ "Op profiling output mode.", verbose);
+ LOG_BENCHMARK_PARAM(std::string, "op_profiling_output_file",
+ "Op profiling output file.", verbose);
LOG_BENCHMARK_PARAM(int32_t, "max_profiling_buffer_entries",
"Max initial profiling buffer entries", verbose);
LOG_BENCHMARK_PARAM(bool, "allow_dynamic_profiling_buffer_increase",
@@ -693,6 +722,31 @@ TfLiteStatus BenchmarkTfLiteModel::ValidateParams() {
return kTfLiteError;
}
+ if (params_.Get<bool>("enable_op_profiling")) {
+ bool found =
+ std::find(std::begin(kOpProfilingOutputModes),
+ std::end(kOpProfilingOutputModes),
+ params_.Get<std::string>("op_profiling_output_mode")) !=
+ std::end(kOpProfilingOutputModes);
+
+ if (!found) {
+ TFLITE_LOG(ERROR) << "Output mode "
+ << params_.Get<std::string>("op_profiling_output_mode")
+ << " is not supported. Supported values are: 'stdout', "
+ "'csv' and 'proto'.";
+ return kTfLiteError;
+ }
+
+ if (!params_.Get<std::string>("profiling_output_csv_file").empty()) {
+ // Backward compatibility for profiling_output_csv_file.
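// (Illustrative equivalence, with a hypothetical path: setting only the
// deprecated flag
//   --profiling_output_csv_file=/tmp/profile.csv
// is rewritten below into the equivalent of
//   --op_profiling_output_mode=csv --op_profiling_output_file=/tmp/profile.csv)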
+ params_.Set<std::string>("op_profiling_output_mode",
+ kOpProfilingOutputModeCsv);
+ params_.Set<std::string>(
+ "op_profiling_output_file",
+ params_.Get<std::string>("profiling_output_csv_file"));
+ }
+ }
+
return PopulateInputLayerInfo(
params_.Get<std::string>("input_layer"),
params_.Get<std::string>("input_layer_shape"),
@@ -1123,9 +1177,9 @@ BenchmarkTfLiteModel::MayCreateProfilingListener() const {
return std::unique_ptr<BenchmarkListener>(new ProfilingListener(
interpreter_.get(), params_.Get<int32_t>("max_profiling_buffer_entries"),
params_.Get<bool>("allow_dynamic_profiling_buffer_increase"),
- params_.Get<std::string>("profiling_output_csv_file"),
+ params_.Get<std::string>("op_profiling_output_file"),
CreateProfileSummaryFormatter(
- !params_.Get<std::string>("profiling_output_csv_file").empty())));
+ params_.Get<std::string>("op_profiling_output_mode"))));
}
TfLiteStatus BenchmarkTfLiteModel::RunImpl() {
diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.cc b/tensorflow/lite/tools/benchmark/profiling_listener.cc
index eff38b0da05f5d..0099c4f8e5fe19 100644
--- a/tensorflow/lite/tools/benchmark/profiling_listener.cc
+++ b/tensorflow/lite/tools/benchmark/profiling_listener.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include
#include
+#include "tensorflow/lite/profiling/profile_summarizer.h"
#include "tensorflow/lite/tools/logging.h"
namespace tflite {
namespace benchmark {
ProfilingListener::ProfilingListener(
Interpreter* interpreter, uint32_t max_num_initial_entries,
- bool allow_dynamic_buffer_increase, const std::string& csv_file_path,
+ bool allow_dynamic_buffer_increase, const std::string& output_file_path,
std::shared_ptr<profiling::ProfileSummaryFormatter> summarizer_formatter)
: run_summarizer_(summarizer_formatter),
init_summarizer_(summarizer_formatter),
- csv_file_path_(csv_file_path),
+ output_file_path_(output_file_path),
interpreter_(interpreter),
- profiler_(max_num_initial_entries, allow_dynamic_buffer_increase) {
+ profiler_(max_num_initial_entries, allow_dynamic_buffer_increase),
+ summarizer_formatter_(summarizer_formatter) {
TFLITE_TOOLS_CHECK(interpreter);
interpreter_->SetProfiler(&profiler_);
@@ -66,27 +68,9 @@ void ProfilingListener::OnSingleRunEnd() { }
void ProfilingListener::OnBenchmarkEnd(const BenchmarkResults& results) {
- std::ofstream output_file(csv_file_path_);
- std::ostream* output_stream = nullptr;
- if (output_file.good()) {
- output_stream = &output_file;
- }
- if (init_summarizer_.HasProfiles()) {
- WriteOutput("Profiling Info for Benchmark Initialization:",
- init_summarizer_.GetOutputString(),
- output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream);
- }
- if (run_summarizer_.HasProfiles()) {
- WriteOutput("Operator-wise Profiling Info for Regular Benchmark Runs:",
- run_summarizer_.GetOutputString(),
- output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream);
- }
-}
-
-void ProfilingListener::WriteOutput(const std::string& header,
- const string& data, std::ostream* stream) {
- (*stream) << header << std::endl;
- (*stream) << data << std::endl;
+ summarizer_formatter_->HandleOutput(init_summarizer_.GetOutputString(),
+ run_summarizer_.GetOutputString(),
+ output_file_path_);
}
} // namespace benchmark
diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.h b/tensorflow/lite/tools/benchmark/profiling_listener.h
index a9957ddb06b7b1..03869e3df5fe31 100644
--- a/tensorflow/lite/tools/benchmark/profiling_listener.h
+++ b/tensorflow/lite/tools/benchmark/profiling_listener.h
@@ -32,7 +32,8 @@ class ProfilingListener : public BenchmarkListener {
public:
ProfilingListener(
Interpreter* interpreter, uint32_t max_num_initial_entries,
- bool allow_dynamic_buffer_increase, const std::string& csv_file_path = "",
+ bool allow_dynamic_buffer_increase,
+ const std::string& output_file_path = "",
std::shared_ptr<profiling::ProfileSummaryFormatter> summarizer_formatter =
std::make_shared<profiling::ProfileSummaryDefaultFormatter>());
@@ -47,13 +48,12 @@ class ProfilingListener : public BenchmarkListener {
protected:
profiling::ProfileSummarizer run_summarizer_;
profiling::ProfileSummarizer init_summarizer_;
- std::string csv_file_path_;
+ std::string output_file_path_;
private:
- void WriteOutput(const std::string& header, const string& data,
- std::ostream* stream);
Interpreter* interpreter_;
profiling::BufferedProfiler profiler_;
+ std::shared_ptr<profiling::ProfileSummaryFormatter> summarizer_formatter_;
};
} // namespace benchmark
diff --git a/tensorflow/lite/tools/cmake/modules/FindProtobuf.cmake b/tensorflow/lite/tools/cmake/modules/FindProtobuf.cmake
new file mode 100644
index 00000000000000..3641e8a69e86b0
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/FindProtobuf.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(protobuf)
\ No newline at end of file
diff --git a/tensorflow/lite/tools/cmake/modules/protobuf.cmake b/tensorflow/lite/tools/cmake/modules/protobuf.cmake
new file mode 100644
index 00000000000000..de09cdeda9c370
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/protobuf.cmake
@@ -0,0 +1,45 @@
+#
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +include(OverridableFetchContent) + +OverridableFetchContent_Declare( + protobuf + GIT_REPOSITORY https://github.com/protocolbuffers/protobuf + # Sync with tensorflow/third_party/protobuf/protobuf.patch + GIT_TAG 90b73ac3f0b10320315c2ca0d03a5a9b095d2f66 + GIT_PROGRESS TRUE + PREFIX "${CMAKE_BINARY_DIR}" + SOURCE_DIR "${CMAKE_BINARY_DIR}/protobuf" +) + +set(protobuf_ABSL_PROVIDER "package" CACHE STRING "" FORCE) +set(protobuf_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(protobuf_BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) +set(protobuf_INSTALL OFF CACHE BOOL "" FORCE) +set(protobuf_WITH_ZLIB OFF CACHE BOOL "" FORCE) +set(protobuf_BUILD_PROTOC_BINARIES ON CACHE BOOL "" FORCE) + +OverridableFetchContent_GetProperties(protobuf) +if(NOT protobuf_POPULATED) + OverridableFetchContent_Populate(protobuf) +endif() + +set(Protobuf_INCLUDE_DIR "${protobuf_SOURCE_DIR}/src" CACHE INTERNAL "") +set(Protobuf_LIBRARIES protobuf::libprotobuf CACHE INTERNAL "") + +add_subdirectory(${protobuf_SOURCE_DIR} ${protobuf_BINARY_DIR}) + +set(Protobuf_PROTOC_EXECUTABLE protoc CACHE INTERNAL "") diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 75fedf76ac1ed9..cc1ee13ce3de3f 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 5, 24) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 5, 28) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch index 30880f5a27eaad..88dc5ae24a6833 100644 --- a/third_party/llvm/generated.patch +++ b/third_party/llvm/generated.patch @@ -1,156 +1,1763 @@ Auto generated patch. Do not edit or delete it, even if empty. -diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp ---- a/clang/lib/Sema/SemaTemplate.cpp -+++ b/clang/lib/Sema/SemaTemplate.cpp -@@ -1807,8 +1807,6 @@ - // Returns the template parameter list with all default template argument - // information. - static TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD) { -- if (TD->isImplicit()) -- return TD->getTemplateParameters(); - // Make sure we get the template parameter list from the most - // recent declaration, since that is the only one that is guaranteed to - // have all the default template argument information. 
-@@ -1829,8 +1827,7 @@ - // template friend struct C; - // }; - // template struct S; -- while ((D->isImplicit() || -- D->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None) && -+ while (D->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None && - D->getPreviousDecl()) - D = D->getPreviousDecl(); - return cast(D)->getTemplateParameters(); -diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp ---- a/clang/lib/Sema/SemaTemplateDeduction.cpp -+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp -@@ -527,8 +527,8 @@ - R->setDefaultArgument( - S.Context, - S.getTrivialTemplateArgumentLoc(Default, QualType(), SourceLocation())); -- if (T->hasTypeConstraint()) { -- auto *C = T->getTypeConstraint(); -+ if (R->hasTypeConstraint()) { -+ auto *C = R->getTypeConstraint(); - R->setTypeConstraint(C->getConceptReference(), - C->getImmediatelyDeclaredConstraint()); - } -@@ -583,53 +583,37 @@ - return TemplateDeductionResult::Success; - - auto NewDeduced = DeducedTemplateArgument(Arg); -- // Provisional resolution for CWG2398: If Arg names a template -- // specialization, then we deduce a synthesized template template parameter -- // based on A, but using the TS's arguments as defaults. -- if (DefaultArguments.size() != 0) { -+ // Provisional resolution for CWG2398: If Arg is also a template template -+ // param, and it names a template specialization, then we deduce a -+ // synthesized template template parameter based on A, but using the TS's -+ // arguments as defaults. -+ if (auto *TempArg = dyn_cast_or_null( -+ Arg.getAsTemplateDecl())) { - assert(Arg.getKind() == TemplateName::Template); -- TemplateDecl *TempArg = Arg.getAsTemplateDecl(); -- TemplateParameterList *As = TempArg->getTemplateParameters(); -- assert(DefaultArguments.size() <= As->size()); -- -- SmallVector Params(As->size()); -- for (unsigned I = 0; I < DefaultArguments.size(); ++I) -- Params[I] = getTemplateParameterWithDefault(S, As->getParam(I), -- DefaultArguments[I]); -- for (unsigned I = DefaultArguments.size(); I < As->size(); ++I) -- Params[I] = As->getParam(I); -- // FIXME: We could unique these, and also the parameters, but we don't -- // expect programs to contain a large enough amount of these deductions -- // for that to be worthwhile. -- auto *TPL = TemplateParameterList::Create( -- S.Context, SourceLocation(), SourceLocation(), Params, -- SourceLocation(), As->getRequiresClause()); -+ assert(!TempArg->isExpandedParameterPack()); - -- TemplateDecl *TD; -- switch (TempArg->getKind()) { -- case Decl::TemplateTemplateParm: { -- auto *A = cast(TempArg); -- assert(!A->isExpandedParameterPack()); -- TD = TemplateTemplateParmDecl::Create( -- S.Context, A->getDeclContext(), SourceLocation(), A->getDepth(), -- A->getPosition(), A->isParameterPack(), A->getIdentifier(), -- A->wasDeclaredWithTypename(), TPL); -- break; +diff -ruN --strip-trailing-cr a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp ++++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +@@ -1,744 +0,0 @@ +-//===- AMDGPUSplitModule.cpp ----------------------------------------------===// +-// +-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +-// See https://llvm.org/LICENSE.txt for license information. 
+-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +-// +-//===----------------------------------------------------------------------===// +-// +-/// \file Implements a module splitting algorithm designed to support the +-/// FullLTO --lto-partitions option for parallel codegen. This is completely +-/// different from the common SplitModule pass, as this system is designed with +-/// AMDGPU in mind. +-/// +-/// The basic idea of this module splitting implementation is the same as +-/// SplitModule: load-balance the module's functions across a set of N +-/// partitions to allow parallel codegen. However, it does it very +-/// differently than the target-agnostic variant: +-/// - Kernels are used as the module's "roots". +-/// They're known entry points on AMDGPU, and everything else is often +-/// internal only. +-/// - Each kernel has a set of dependencies, and when a kernel and its +-/// dependencies is considered "big", we try to put it in a partition where +-/// most dependencies are already imported, to avoid duplicating large +-/// amounts of code. +-/// - There's special care for indirect calls in order to ensure +-/// AMDGPUResourceUsageAnalysis can work correctly. +-/// +-/// This file also includes a more elaborate logging system to enable +-/// users to easily generate logs that (if desired) do not include any value +-/// names, in order to not leak information about the source file. +-/// Such logs are very helpful to understand and fix potential issues with +-/// module splitting. +- +-#include "AMDGPUSplitModule.h" +-#include "AMDGPUTargetMachine.h" +-#include "Utils/AMDGPUBaseInfo.h" +-#include "llvm/ADT/DenseMap.h" +-#include "llvm/ADT/SmallVector.h" +-#include "llvm/ADT/StringExtras.h" +-#include "llvm/ADT/StringRef.h" +-#include "llvm/Analysis/CallGraph.h" +-#include "llvm/Analysis/TargetTransformInfo.h" +-#include "llvm/IR/Function.h" +-#include "llvm/IR/Instruction.h" +-#include "llvm/IR/Module.h" +-#include "llvm/IR/User.h" +-#include "llvm/IR/Value.h" +-#include "llvm/Support/Casting.h" +-#include "llvm/Support/Debug.h" +-#include "llvm/Support/FileSystem.h" +-#include "llvm/Support/Path.h" +-#include "llvm/Support/Process.h" +-#include "llvm/Support/SHA256.h" +-#include "llvm/Support/Threading.h" +-#include "llvm/Support/raw_ostream.h" +-#include "llvm/Transforms/Utils/Cloning.h" +-#include +-#include +-#include +-#include +-#include +-#include +- +-using namespace llvm; +- +-#define DEBUG_TYPE "amdgpu-split-module" +- +-namespace { +- +-static cl::opt LargeKernelFactor( +- "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f), +- cl::Hidden, +- cl::desc( +- "consider a kernel as large and needing special treatment when it " +- "exceeds the average cost of a partition by this factor; e;g. 
2.0 " +- "means if the kernel and its dependencies is 2 times bigger than " +- "an average partition; 0 disables large kernels handling entirely")); +- +-static cl::opt LargeKernelOverlapForMerge( +- "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f), +- cl::Hidden, +- cl::desc("defines how much overlap between two large kernel's dependencies " +- "is needed to put them in the same partition")); +- +-static cl::opt NoExternalizeGlobals( +- "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, +- cl::desc("disables externalization of global variable with local linkage; " +- "may cause globals to be duplicated which increases binary size")); +- +-static cl::opt +- LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden, +- cl::desc("output directory for AMDGPU module splitting logs")); +- +-static cl::opt +- LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden, +- cl::desc("hash value names before printing them in the AMDGPU " +- "module splitting logs")); +- +-using CostType = InstructionCost::CostType; +-using PartitionID = unsigned; +- +-static bool isEntryPoint(const Function *F) { +- return AMDGPU::isEntryFunctionCC(F->getCallingConv()); +-} +- +-static std::string getName(const Value &V) { +- static bool HideNames; +- +- static llvm::once_flag HideNameInitFlag; +- llvm::call_once(HideNameInitFlag, [&]() { +- if (LogPrivate.getNumOccurrences()) +- HideNames = LogPrivate; +- else { +- const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE"); +- HideNames = (EV.value_or("0") != "0"); +- } +- }); +- +- if (!HideNames) +- return V.getName().str(); +- return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())), +- /*LowerCase=*/true); +-} +- +-/// Main logging helper. +-/// +-/// Logging can be configured by the following environment variable. +-/// AMD_SPLIT_MODULE_LOG_DIR= +-/// If set, uses as the directory to write logfiles to +-/// each time module splitting is used. +-/// AMD_SPLIT_MODULE_LOG_PRIVATE +-/// If set to anything other than zero, all names are hidden. +-/// +-/// Both environment variables have corresponding CL options which +-/// takes priority over them. +-/// +-/// Any output printed to the log files is also printed to dbgs() when -debug is +-/// used and LLVM_DEBUG is defined. +-/// +-/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic +-/// cannot be removed from the code (by building without debug). This probably +-/// has a small performance cost because if some computation/formatting is +-/// needed for logging purpose, it may be done everytime only to be ignored +-/// by the logger. +-/// +-/// As this pass only runs once and is not doing anything computationally +-/// expensive, this is likely a reasonable trade-off. +-/// +-/// If some computation should really be avoided when unused, users of the class +-/// can check whether any logging will occur by using the bool operator. +-/// +-/// \code +-/// if (SML) { +-/// // Executes only if logging to a file or if -debug is available and +-/// used. +-/// } +-/// \endcode +-class SplitModuleLogger { +-public: +- SplitModuleLogger(const Module &M) { +- std::string LogDir = LogDirOpt; +- if (LogDir.empty()) +- LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or(""); +- +- // No log dir specified means we don't need to log to a file. +- // We may still log to dbgs(), though. +- if (LogDir.empty()) +- return; +- +- // If a log directory is specified, create a new file with a unique name in +- // that directory. 
+- int Fd; +- SmallString<0> PathTemplate; +- SmallString<0> RealPath; +- sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt"); +- if (auto Err = +- sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) { +- report_fatal_error("Failed to create log file at '" + Twine(LogDir) + +- "': " + Err.message(), +- /*CrashDiag=*/false); +- } +- +- FileOS = std::make_unique(Fd, /*shouldClose=*/true); +- } +- +- bool hasLogFile() const { return FileOS != nullptr; } +- +- raw_ostream &logfile() { +- assert(FileOS && "no logfile!"); +- return *FileOS; +- } +- +- /// \returns true if this SML will log anything either to a file or dbgs(). +- /// Can be used to avoid expensive computations that are ignored when logging +- /// is disabled. +- operator bool() const { +- return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE)); +- } +- +-private: +- std::unique_ptr FileOS; +-}; +- +-template +-static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { +- static_assert( +- !std::is_same_v, +- "do not print values to logs directly, use handleName instead!"); +- LLVM_DEBUG(dbgs() << Val); +- if (SML.hasLogFile()) +- SML.logfile() << Val; +- return SML; +-} +- +-/// Calculate the cost of each function in \p M +-/// \param SML Log Helper +-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo. +-/// \param M Module to analyze. +-/// \param CostMap[out] Resulting Function -> Cost map. +-/// \return The module's total cost. +-static CostType +-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, +- Module &M, +- DenseMap &CostMap) { +- CostType ModuleCost = 0; +- CostType KernelCost = 0; +- +- for (auto &Fn : M) { +- if (Fn.isDeclaration()) +- continue; +- +- CostType FnCost = 0; +- TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn); +- +- for (const auto &BB : Fn) { +- for (const auto &I : BB) { +- auto Cost = +- TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); +- assert(Cost != InstructionCost::getMax()); +- // Assume expensive if we can't tell the cost of an instruction. 
+- CostType CostVal = +- Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive); +- assert((FnCost + CostVal) >= FnCost && "Overflow!"); +- FnCost += CostVal; - } -- case Decl::ClassTemplate: { -- auto *A = cast(TempArg); -- auto *CT = ClassTemplateDecl::Create(S.Context, A->getDeclContext(), -- SourceLocation(), A->getDeclName(), -- TPL, A->getTemplatedDecl()); -- CT->setPreviousDecl(A); -- TD = CT; -- break; +- } +- +- assert(FnCost != 0); +- +- CostMap[&Fn] = FnCost; +- assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!"); +- ModuleCost += FnCost; +- +- if (isEntryPoint(&Fn)) +- KernelCost += FnCost; +- } +- +- CostType FnCost = (ModuleCost - KernelCost); +- SML << "=> Total Module Cost: " << ModuleCost << '\n' +- << " => KernelCost: " << KernelCost << " (" +- << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n" +- << " => FnsCost: " << FnCost << " (" +- << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n"; +- +- return ModuleCost; +-} +- +-static bool canBeIndirectlyCalled(const Function &F) { +- if (F.isDeclaration() || isEntryPoint(&F)) +- return false; +- return !F.hasLocalLinkage() || +- F.hasAddressTaken(/*PutOffender=*/nullptr, +- /*IgnoreCallbackUses=*/false, +- /*IgnoreAssumeLikeCalls=*/true, +- /*IgnoreLLVMUsed=*/true, +- /*IgnoreARCAttachedCall=*/false, +- /*IgnoreCastedDirectCall=*/true); +-} +- +-/// When a kernel or any of its callees performs an indirect call, this function +-/// takes over \ref addAllDependencies and adds all potentially callable +-/// functions to \p Fns so they can be counted as dependencies of the kernel. +-/// +-/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the +-/// presence of an indirect call, the function's resource usage is the same as +-/// the most expensive function in the module. +-/// \param M The module. +-/// \param Fns[out] Resulting list of functions. +-static void addAllIndirectCallDependencies(const Module &M, +- DenseSet &Fns) { +- for (const auto &Fn : M) { +- if (canBeIndirectlyCalled(Fn)) +- Fns.insert(&Fn); +- } +-} +- +-/// Adds the functions that \p Fn may call to \p Fns, then recurses into each +-/// callee until all reachable functions have been gathered. +-/// +-/// \param SML Log Helper +-/// \param CG Call graph for \p Fn's module. +-/// \param Fn Current function to look at. +-/// \param Fns[out] Resulting list of functions. +-/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some +-/// point, either in \p Fn or in one of the function it calls. When that +-/// happens, we fall back to adding all callable functions inside \p Fn's module +-/// to \p Fns. +-static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, +- const Function &Fn, +- DenseSet &Fns, +- bool &HadIndirectCall) { +- assert(!Fn.isDeclaration()); +- +- const Module &M = *Fn.getParent(); +- SmallVector WorkList({&Fn}); +- while (!WorkList.empty()) { +- const auto &CurFn = *WorkList.pop_back_val(); +- assert(!CurFn.isDeclaration()); +- +- // Scan for an indirect call. If such a call is found, we have to +- // conservatively assume this can call all non-entrypoint functions in the +- // module. +- +- for (auto &CGEntry : *CG[&CurFn]) { +- auto *CGNode = CGEntry.second; +- auto *Callee = CGNode->getFunction(); +- if (!Callee) { +- // Functions have an edge towards CallsExternalNode if they're external +- // declarations, or if they do an indirect call. As we only process +- // definitions here, we know this means the function has an indirect +- // call. 
We then have to conservatively assume this can call all +- // non-entrypoint functions in the module. +- if (CGNode != CG.getCallsExternalNode()) +- continue; // this is another function-less node we don't care about. +- +- SML << "Indirect call detected in " << getName(CurFn) +- << " - treating all non-entrypoint functions as " +- "potential dependencies\n"; +- +- // TODO: Print an ORE as well ? +- addAllIndirectCallDependencies(M, Fns); +- HadIndirectCall = true; +- return; - } -- default: -- llvm_unreachable("Unexpected Template Kind"); -+ TemplateParameterList *As = TempArg->getTemplateParameters(); -+ if (DefaultArguments.size() != 0) { -+ assert(DefaultArguments.size() <= As->size()); -+ SmallVector Params(As->size()); -+ for (unsigned I = 0; I < DefaultArguments.size(); ++I) -+ Params[I] = getTemplateParameterWithDefault(S, As->getParam(I), -+ DefaultArguments[I]); -+ for (unsigned I = DefaultArguments.size(); I < As->size(); ++I) -+ Params[I] = As->getParam(I); -+ // FIXME: We could unique these, and also the parameters, but we don't -+ // expect programs to contain a large enough amount of these deductions -+ // for that to be worthwhile. -+ auto *TPL = TemplateParameterList::Create( -+ S.Context, SourceLocation(), SourceLocation(), Params, -+ SourceLocation(), As->getRequiresClause()); -+ NewDeduced = DeducedTemplateArgument( -+ TemplateName(TemplateTemplateParmDecl::Create( -+ S.Context, TempArg->getDeclContext(), SourceLocation(), -+ TempArg->getDepth(), TempArg->getPosition(), -+ TempArg->isParameterPack(), TempArg->getIdentifier(), -+ TempArg->wasDeclaredWithTypename(), TPL))); - } -- TD->setImplicit(true); -- NewDeduced = DeducedTemplateArgument(TemplateName(TD)); - } - - DeducedTemplateArgument Result = checkDeducedTemplateArguments(S.Context, -diff -ruN --strip-trailing-cr a/clang/test/CXX/temp/temp.decls/temp.alias/p2.cpp b/clang/test/CXX/temp/temp.decls/temp.alias/p2.cpp ---- a/clang/test/CXX/temp/temp.decls/temp.alias/p2.cpp -+++ b/clang/test/CXX/temp/temp.decls/temp.alias/p2.cpp -@@ -28,14 +28,13 @@ - { /* ... */ } - - template class TT> -- void f(TT); -+ void f(TT); // expected-note {{candidate template ignored}} - - template class TT> - void g(TT>); +- +- if (Callee->isDeclaration()) +- continue; +- +- auto [It, Inserted] = Fns.insert(Callee); +- if (Inserted) +- WorkList.push_back(Callee); +- } +- } +-} +- +-/// Contains information about a kernel and its dependencies. +-struct KernelWithDependencies { +- KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG, +- const DenseMap &FnCosts, +- const Function *Fn) +- : Fn(Fn) { +- addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall); +- TotalCost = FnCosts.at(Fn); +- for (const auto *Dep : Dependencies) { +- TotalCost += FnCosts.at(Dep); +- +- // We cannot duplicate functions with external linkage, or functions that +- // may be overriden at runtime. +- HasNonDuplicatableDependecy |= +- (Dep->hasExternalLinkage() || !Dep->isDefinitionExact()); +- } +- } +- +- const Function *Fn = nullptr; +- DenseSet Dependencies; +- /// Whether \p Fn or any of its \ref Dependencies contains an indirect call. +- bool HasIndirectCall = false; +- /// Whether any of \p Fn's dependencies cannot be duplicated. +- bool HasNonDuplicatableDependecy = false; +- +- CostType TotalCost = 0; +- +- /// \returns true if this kernel and its dependencies can be considered large +- /// according to \p Threshold. 
+- bool isLarge(CostType Threshold) const { +- return TotalCost > Threshold && !Dependencies.empty(); +- } +-}; +- +-/// Calculates how much overlap there is between \p A and \p B. +-/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A +-/// and B have no shared elements. Kernels do not count in overlap calculation. +-static float calculateOverlap(const DenseSet &A, +- const DenseSet &B) { +- DenseSet Total; +- for (const auto *F : A) { +- if (!isEntryPoint(F)) +- Total.insert(F); +- } +- +- if (Total.empty()) +- return 0.0f; +- +- unsigned NumCommon = 0; +- for (const auto *F : B) { +- if (isEntryPoint(F)) +- continue; +- +- auto [It, Inserted] = Total.insert(F); +- if (!Inserted) +- ++NumCommon; +- } +- +- return static_cast(NumCommon) / Total.size(); +-} +- +-/// Performs all of the partitioning work on \p M. +-/// \param SML Log Helper +-/// \param M Module to partition. +-/// \param NumParts Number of partitions to create. +-/// \param ModuleCost Total cost of all functions in \p M. +-/// \param FnCosts Map of Function -> Cost +-/// \param WorkList Kernels and their dependencies to process in order. +-/// \returns The created partitions (a vector of size \p NumParts ) +-static std::vector> +-doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, +- CostType ModuleCost, +- const DenseMap &FnCosts, +- const SmallVector &WorkList) { +- +- SML << "\n--Partitioning Starts--\n"; +- +- // Calculate a "large kernel threshold". When more than one kernel's total +- // import cost exceeds this value, we will try to merge it with other, +- // similarly large kernels. +- // +- // e.g. let two kernels X and Y have a import cost of ~10% of the module, we +- // assign X to a partition as usual, but when we get to Y, we check if it's +- // worth also putting it in Y's partition. +- const CostType LargeKernelThreshold = +- LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor) +- : std::numeric_limits::max(); +- +- std::vector> Partitions; +- Partitions.resize(NumParts); +- +- // Assign a partition to each kernel, and try to keep the partitions more or +- // less balanced. We do that through a priority queue sorted in reverse, so we +- // can always look at the partition with the least content. +- // +- // There are some cases where we will be deliberately unbalanced though. +- // - Large kernels: we try to merge with existing partitions to reduce code +- // duplication. +- // - Kernels with indirect or external calls always go in the first partition +- // (P0). +- auto ComparePartitions = [](const std::pair &a, +- const std::pair &b) { +- // When two partitions have the same cost, assign to the one with the +- // biggest ID first. This allows us to put things in P0 last, because P0 may +- // have other stuff added later. +- if (a.second == b.second) +- return a.first < b.first; +- return a.second > b.second; +- }; +- +- // We can't use priority_queue here because we need to be able to access any +- // element. This makes this a bit inefficient as we need to sort it again +- // everytime we change it, but it's a very small array anyway (likely under 64 +- // partitions) so it's a cheap operation. +- std::vector> BalancingQueue; +- for (unsigned I = 0; I < NumParts; ++I) +- BalancingQueue.push_back(std::make_pair(I, 0)); +- +- // Helper function to handle assigning a kernel to a partition. This takes +- // care of updating the balancing queue. 
+- const auto AssignToPartition = [&](PartitionID PID, +- const KernelWithDependencies &KWD) { +- auto &FnsInPart = Partitions[PID]; +- FnsInPart.insert(KWD.Fn); +- FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end()); +- +- SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> "; +- if (!KWD.Dependencies.empty()) { +- SML << KWD.Dependencies.size() << " dependencies added\n"; +- }; +- +- // Update the balancing queue. we scan backwards because in the common case +- // the partition is at the end. +- for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { +- if (QueuePID == PID) { +- CostType NewCost = 0; +- for (auto *Fn : Partitions[PID]) +- NewCost += FnCosts.at(Fn); +- +- SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; +- if (Cost) { +- SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) +- << "% increase)"; +- } +- SML << '\n'; +- +- Cost = NewCost; +- } +- } +- +- sort(BalancingQueue, ComparePartitions); +- }; +- +- for (auto &CurKernel : WorkList) { +- // When a kernel has indirect calls, it must stay in the first partition +- // alongside every reachable non-entry function. This is a nightmare case +- // for splitting as it severely limits what we can do. +- if (CurKernel.HasIndirectCall) { +- SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn) +- << " defaulting to P0\n"; +- AssignToPartition(0, CurKernel); +- continue; +- } +- +- // When a kernel has non duplicatable dependencies, we have to keep it in +- // the first partition as well. This is a conservative approach, a +- // finer-grained approach could keep track of which dependencies are +- // non-duplicatable exactly and just make sure they're grouped together. +- if (CurKernel.HasNonDuplicatableDependecy) { +- SML << "Kernel with externally visible dependency " +- << getName(*CurKernel.Fn) << " defaulting to P0\n"; +- AssignToPartition(0, CurKernel); +- continue; +- } +- +- // Be smart with large kernels to avoid duplicating their dependencies. +- if (CurKernel.isLarge(LargeKernelThreshold)) { +- assert(LargeKernelOverlapForMerge >= 0.0f && +- LargeKernelOverlapForMerge <= 1.0f); +- SML << "Large Kernel: " << getName(*CurKernel.Fn) +- << " - looking for partition with at least " +- << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n"; +- +- bool Assigned = false; +- for (const auto &[PID, Fns] : enumerate(Partitions)) { +- float Overlap = calculateOverlap(CurKernel.Dependencies, Fns); +- SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" +- << PID << '\n'; +- if (Overlap > LargeKernelOverlapForMerge) { +- SML << " selecting P" << PID << '\n'; +- AssignToPartition(PID, CurKernel); +- Assigned = true; +- } +- } +- +- if (Assigned) +- continue; +- } +- +- // Normal "load-balancing", assign to partition with least pressure. +- auto [PID, CurCost] = BalancingQueue.back(); +- AssignToPartition(PID, CurKernel); +- } +- +- // Work is mostly done now, verify the partioning and add all functions we may +- // have missed (= unreachable, or we don't understand how they're reached) to +- // P0. +- DenseSet AllFunctions; +- for (const auto &[Idx, Part] : enumerate(Partitions)) { +- CostType Cost = 0; +- for (auto *Fn : Part) { +- // external linkage functions should exclusively be in the first partition +- // at this stage. In theory, we should only ever see external linkage +- // functions here if they're kernels, or if they've been added due to a +- // kernel using indirect calls somewhere in its CallGraph. 
+-      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
+-      Cost += FnCosts.at(Fn);
+-    }
+-    SML << "P" << Idx << " has a total cost of " << Cost << " ("
+-        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+-        << "% of source module)\n";
+-    AllFunctions.insert(Part.begin(), Part.end());
+-  }
+-
+-  // Add missed functions to P0. This will take care of adding things like
+-  // external functions with no callers in the module to P0. This should be
+-  // fairly rare as AMDGPU internalizes everything in most cases, so unused
+-  // internal functions would get removed.
+-  for (auto &Fn : M) {
+-    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
+-      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
+-      Partitions[0].insert(&Fn);
+-    }
+-  }
+-
+-  SML << "--Partitioning Done--\n\n";
+-
+-  return Partitions;
+-}
+-
+-static void externalize(GlobalValue &GV) {
+-  if (GV.hasLocalLinkage()) {
+-    GV.setLinkage(GlobalValue::ExternalLinkage);
+-    GV.setVisibility(GlobalValue::HiddenVisibility);
+-  }
+-
+-  // Unnamed entities must be named consistently between modules. setName will
+-  // give a distinct name to each such entity.
+-  if (!GV.hasName())
+-    GV.setName("__llvmsplit_unnamed");
+-}
+-} // end anonymous namespace
+-
+-void llvm::splitAMDGPUModule(
+-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+-
+-  SplitModuleLogger SML(M);
+-
+-  CallGraph CG(M);
+-
+-  // Externalize functions whose address is taken.
+-  //
+-  // This is needed because partitioning is purely based on calls, but sometimes
+-  // a kernel/function may just look at the address of another local function
+-  // and not do anything (no calls). After partitioning, that local function may
+-  // end up in a different module (so it's just a declaration in the module
+-  // where its address is taken), which emits an "undefined hidden symbol"
+-  // linker error.
+-  //
+-  // Additionally, it guides partitioning to not duplicate this function if it's
+-  // called directly at some point.
+-  for (auto &Fn : M) {
+-    if (Fn.hasAddressTaken()) {
+-      if (Fn.hasLocalLinkage()) {
+-        SML << "[externalize] " << Fn.getName()
+-            << " because its address is taken\n";
+-      }
+-      externalize(Fn);
+-    }
+-  }
+-
+-  // Externalize local GVs, which avoids duplicating their initializers, which
+-  // in turn helps keep code size in check.
+-  if (!NoExternalizeGlobals) {
+-    for (auto &GV : M.globals()) {
+-      if (GV.hasLocalLinkage())
+-        SML << "[externalize] GV " << GV.getName() << '\n';
+-      externalize(GV);
+-    }
+-  }
+-
+-  // Start by calculating the cost of every function in the module, as well as
+-  // the module's overall cost.
+-  DenseMap<const Function *, CostType> FnCosts;
+-  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+-
+-  // Gather every kernel into a WorkList, then sort it by descending total cost
+-  // of the kernel so the biggest kernels are seen first.
+-  SmallVector<KernelWithDependencies> WorkList;
+-  for (auto &Fn : M) {
+-    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+-      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+-  }
+-  sort(WorkList, [&](auto &A, auto &B) {
+-    // Sort by total cost, and if the total cost is identical, sort
+-    // alphabetically.
+-    if (A.TotalCost == B.TotalCost)
+-      return A.Fn->getName() < B.Fn->getName();
+-    return A.TotalCost > B.TotalCost;
+-  });
+-
+-  if (SML) {
+-    SML << "Worklist\n";
+-    for (const auto &KWD : WorkList) {
+-      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
+-          << " indirect:" << KWD.HasIndirectCall
+-          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+-          << ")\n";
+-      for (const auto *Dep : KWD.Dependencies)
+-        SML << " [Dep] " << getName(*Dep) << '\n';
+-    }
+-  }
+-
+-  // This performs all of the partitioning work.
+-  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
+-  assert(Partitions.size() == N);
+-
+-  // If we didn't externalize GVs, then local GVs need to be conservatively
+-  // imported into every module (including their initializers), and then cleaned
+-  // up afterwards.
+-  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+-    // We conservatively import private/internal GVs into every module and clean
+-    // them up afterwards.
+-    const auto *Var = dyn_cast<GlobalVariable>(GV);
+-    return Var && Var->hasLocalLinkage();
+-  };
+-
+-  SML << "Creating " << N << " modules...\n";
+-  unsigned TotalFnImpls = 0;
+-  for (unsigned I = 0; I < N; ++I) {
+-    const auto &FnsInPart = Partitions[I];
+-
+-    ValueToValueMapTy VMap;
+-    std::unique_ptr<Module> MPart(
+-        CloneModule(M, VMap, [&](const GlobalValue *GV) {
+-          // Functions go in their assigned partition.
+-          if (const auto *Fn = dyn_cast<Function>(GV)) {
+-// Check we don't import an external linkage function in any
+-// partition other than P0.
+-#ifndef NDEBUG
+-            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
+-              assert((I == 0) == FnsInPart.contains(Fn));
+-            }
+-#endif
+-            return FnsInPart.contains(Fn);
+-          }
+-
+-          if (NeedsConservativeImport(GV))
+-            return true;
+-
+-          // Everything else goes in the first partition.
+-          return I == 0;
+-        }));
+-
+-    // Clean up conservatively imported GVs without any users.
+-    for (auto &GV : make_early_inc_range(MPart->globals())) {
+-      if (NeedsConservativeImport(&GV) && GV.use_empty())
+-        GV.eraseFromParent();
+-    }
+-
+-    unsigned NumAllFns = 0, NumKernels = 0;
+-    for (auto &Cur : *MPart) {
+-      if (!Cur.isDeclaration()) {
+-        ++NumAllFns;
+-        if (isEntryPoint(&Cur))
+-          ++NumKernels;
+-      }
+-    }
+-    TotalFnImpls += NumAllFns;
+-    SML << " - Module " << I << " with " << NumAllFns << " functions ("
+-        << NumKernels << " kernels)\n";
+-    ModuleCallback(std::move(MPart));
+-  }
+-
+-  SML << TotalFnImpls << " function definitions across all modules ("
+-      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
+-      << "% of original module)\n";
+-}
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
+--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
++++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
+@@ -1,30 +0,0 @@
+-//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
+-//
+-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+-// See https://llvm.org/LICENSE.txt for license information.
+-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+-//
+-//===----------------------------------------------------------------------===//
+-//
+-//===----------------------------------------------------------------------===//
+-
+-#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
+-#define LLVM_TARGET_AMDGPUSPLITMODULE_H
+-
+-#include "llvm/ADT/STLFunctionalExtras.h"
+-#include <memory>
+-
+-namespace llvm {
+-
+-class Module;
+-class AMDGPUTargetMachine;
+-
+-/// Splits the module M into N linkable partitions. The function ModuleCallback
+-/// is called N times passing each individual partition as the MPart argument.
+-void splitAMDGPUModule(
+-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
+-
+-} // end namespace llvm
+-
+-#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
++++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+@@ -21,7 +21,6 @@
+ #include "AMDGPUIGroupLP.h"
+ #include "AMDGPUMacroFusion.h"
+ #include "AMDGPURegBankSelect.h"
+-#include "AMDGPUSplitModule.h"
+ #include "AMDGPUTargetObjectFile.h"
+ #include "AMDGPUTargetTransformInfo.h"
+ #include "AMDGPUUnifyDivergentExitNodes.h"
+@@ -816,13 +815,6 @@
+   return AMDGPUAS::FLAT_ADDRESS;
+ }
-   int h() {
--    f(v); // OK: TT = vector, Alloc is used as the default argument for the
--          // second parameter.
-+    f(v); // expected-error {{no matching function for call to 'f'}}
-     g(v); // OK: TT = vector
-   }

-diff -ruN --strip-trailing-cr a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
---- a/clang/test/SemaTemplate/cwg2398.cpp
-+++ b/clang/test/SemaTemplate/cwg2398.cpp
-@@ -65,10 +65,13 @@
-   template  struct B;

+-bool AMDGPUTargetMachine::splitModule(
+-    Module &M, unsigned NumParts,
+-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
+-  splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
+-  return true;
+-}
+-
+ //===----------------------------------------------------------------------===//
+ // GCN Target Machine (SI+)
+ //===----------------------------------------------------------------------===//
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
++++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+@@ -73,10 +73,6 @@
+   getPredicatedAddrSpace(const Value *V) const override;

+   unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
+-
+-  bool splitModule(Module &M, unsigned NumParts,
+-                   function_ref<void(std::unique_ptr<Module> MPart)>
+-                       ModuleCallback) const override;
+ };

- template
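
To make the partitioning scheme removed above easier to follow outside of the LLVM sources, here is a minimal, self-contained C++ sketch of the same balancing-queue-plus-overlap heuristic. All names and types here (Kernel, overlap, partition, and plain std::set<std::string> standing in for LLVM's DenseSet<const Function *>) are invented for this illustration, and the cost update is deliberately cruder than the pass's recomputation from FnCosts:

#include <algorithm>
#include <cstdint>
#include <set>
#include <string>
#include <utility>
#include <vector>

using CostType = std::uint64_t;
using PartitionID = unsigned;

struct Kernel {
  std::string Name;
  std::set<std::string> Dependencies;
  CostType TotalCost = 0;
};

// Overlap between a kernel's dependency set A and the contents of a
// partition B: |A intersect B| / |A|.
static float overlap(const std::set<std::string> &A,
                     const std::set<std::string> &B) {
  if (A.empty())
    return 0.0f;
  unsigned NumCommon = 0;
  for (const auto &F : A)
    NumCommon += B.count(F);
  return static_cast<float>(NumCommon) / A.size();
}

static std::vector<std::set<std::string>>
partition(const std::vector<Kernel> &WorkList, unsigned NumParts,
          CostType LargeKernelThreshold, float MergeOverlap) {
  std::vector<std::set<std::string>> Parts(NumParts);

  // (partition, cost) pairs, kept sorted so the least-loaded partition is
  // always at the back; ties push the biggest ID towards the back so that
  // P0 fills up last.
  std::vector<std::pair<PartitionID, CostType>> Queue;
  for (PartitionID I = 0; I != NumParts; ++I)
    Queue.push_back({I, 0});

  auto Assign = [&](PartitionID PID, const Kernel &K) {
    Parts[PID].insert(K.Name);
    Parts[PID].insert(K.Dependencies.begin(), K.Dependencies.end());
    for (auto &[QPID, Cost] : Queue)
      if (QPID == PID)
        Cost += K.TotalCost; // simplified; the pass recomputes from FnCosts
    std::sort(Queue.begin(), Queue.end(),
              [](const auto &A, const auto &B) {
                if (A.second == B.second)
                  return A.first < B.first;
                return A.second > B.second;
              });
  };

  for (const Kernel &K : WorkList) {
    // Large kernel: prefer a partition that already contains enough of its
    // dependencies (this sketch stops at the first match).
    if (K.TotalCost > LargeKernelThreshold && !K.Dependencies.empty()) {
      bool Assigned = false;
      for (PartitionID PID = 0; PID != NumParts && !Assigned; ++PID) {
        if (overlap(K.Dependencies, Parts[PID]) > MergeOverlap) {
          Assign(PID, K);
          Assigned = true;
        }
      }
      if (Assigned)
        continue;
    }
    // Otherwise, plain load balancing: take the least-loaded partition.
    Assign(Queue.back().first, K);
  }
  return Parts;
}

Keeping a plain sorted vector instead of a std::priority_queue mirrors the deleted pass's stated design choice: the queue must allow updating an arbitrary element's cost, and with at most a few dozen partitions, re-sorting after every assignment is cheap.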
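The externalization step that precedes partitioning can likewise be sketched against the public LLVM IR API. This is an approximation of the deleted externalize() helper and its two call sites, not the pass itself; externalizeForSplit and the "__split_unnamed" name are placeholders chosen for the example:

#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static void externalizeForSplit(Module &M) {
  auto Fix = [](GlobalValue &GV) {
    // Local linkage becomes external-hidden so cross-partition references
    // resolve at link time without leaking symbols.
    if (GV.hasLocalLinkage()) {
      GV.setLinkage(GlobalValue::ExternalLinkage);
      GV.setVisibility(GlobalValue::HiddenVisibility);
    }
    // Unnamed values get a stable name; setName() uniques duplicates.
    if (!GV.hasName())
      GV.setName("__split_unnamed");
  };
  for (Function &Fn : M)
    if (Fn.hasAddressTaken()) // the address may escape to another partition
      Fix(Fn);
  for (GlobalVariable &GV : M.globals())
    Fix(GV);
}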
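Finally, a hypothetical caller of the splitModule() hook whose AMDGPU override is deleted above. It assumes the base TargetMachine::splitModule() virtual from the same LLVM revision (with the signature shown in the removed override) and an already-constructed TargetMachine and Module; splitIntoParts is an illustrative name:

#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"

#include <memory>
#include <vector>

using namespace llvm;

static std::vector<std::unique_ptr<Module>>
splitIntoParts(TargetMachine &TM, Module &M, unsigned NumParts) {
  std::vector<std::unique_ptr<Module>> Parts;
  // splitModule() invokes the callback once per produced partition and
  // returns false if the target does not support module splitting.
  if (!TM.splitModule(M, NumParts, [&](std::unique_ptr<Module> MPart) {
        Parts.push_back(std::move(MPart));
      }))
    Parts.clear();
  return Parts;
}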