Merge pull request #2325 from ROCmSoftwarePlatform/to-2.13.1
Update 2.13 branch to 2.13.1
jayfurmanek authored Dec 18, 2023
2 parents 644d3a9 + 01e3857 commit 10c36c1
Showing 14 changed files with 103 additions and 46 deletions.
6 changes: 6 additions & 0 deletions RELEASE.md
@@ -1,3 +1,9 @@
+# Release 2.13.1
+
+### Bug Fixes and Other Changes
+
+* Refactor CpuExecutable to propagate LLVM errors.
+
 # Release 2.13.0
 
 ## TensorFlow
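The release-note bullet is terse; what it describes (in the diffs below) is replacing CpuExecutable's public constructors, which could only CHECK-crash when a JIT symbol lookup failed, with static factory functions that return StatusOr. A minimal, self-contained sketch of that pattern, using hypothetical JitExecutable/LookupSymbol names and plain absl rather than the real XLA classes:

```cpp
#include <memory>
#include <string>
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"

// Hypothetical stand-in for a fallible JIT symbol lookup.
bool LookupSymbol(const std::string& name) { return name == "entry"; }

class JitExecutable {
 public:
  // Fallible work happens here, before any object reaches the caller,
  // and failure becomes a Status instead of a CHECK crash.
  static absl::StatusOr<std::unique_ptr<JitExecutable>> Create(
      const std::string& entry_function_name) {
    if (!LookupSymbol(entry_function_name)) {
      return absl::InvalidArgumentError(
          absl::StrCat("Symbol ", entry_function_name, " not found."));
    }
    // The constructor itself stays trivial and infallible.
    return std::unique_ptr<JitExecutable>(
        new JitExecutable(entry_function_name));
  }

 private:
  explicit JitExecutable(std::string name) : name_(std::move(name)) {}
  std::string name_;
};
```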
1 change: 1 addition & 0 deletions tensorflow/compiler/tests/BUILD
@@ -2114,6 +2114,7 @@ tf_xla_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
+        "//tensorflow/python/framework:errors",
         "//tensorflow/python/platform:client_testlib",
     ],
 )
17 changes: 17 additions & 0 deletions tensorflow/compiler/tests/xla_custom_call_ops_test.py
@@ -18,6 +18,7 @@
 from tensorflow.compiler.tf2xla.python import xla
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import random_ops
@@ -46,6 +47,22 @@ def f(x, y):
       self.assertIn('custom_call_target="my_call"', hlo)
       self.assertIn('backend_config="my_backend_config"', hlo)
 
+  def testXlaCustomCallOpDoesntExist(self):
+    with ops.device('device:{}:0'.format(self.device)):
+
+      def f():
+        return xla.custom_call(
+            args=(1, 2),
+            target_name='my_non_existing_call_target',
+            dtype=dtypes.int32,
+            shape=(),
+            backend_config='my_backend_config',
+        )
+
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        compiled_f = def_function.function(f, jit_compile=True)
+        compiled_f()
+
   def testXlaCustomCallV2Op(self):
     with ops.device('device:{}:0'.format(self.device)):
 
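The new testXlaCustomCallOpDoesntExist works because the absl::InvalidArgumentError produced in C++ (see the cpu_executable.cc diff below) crosses into Python with its status code intact, and TensorFlow raises kInvalidArgument as errors_impl.InvalidArgumentError. A tiny status-side sketch in plain absl — not TensorFlow's actual binding code:

```cpp
#include <iostream>

#include "absl/status/status.h"

int main() {
  absl::Status s = absl::InvalidArgumentError("Symbol my_call not found.");
  // The status code, not the message, selects the Python exception type
  // (kInvalidArgument -> errors_impl.InvalidArgumentError).
  std::cout << (s.code() == absl::StatusCode::kInvalidArgument) << "\n";  // 1
  std::cout << s.message() << "\n";
  return 0;
}
```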
11 changes: 7 additions & 4 deletions tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -1403,9 +1403,12 @@ CpuCompiler::CompileLegacyCpuExecutable(std::unique_ptr<HloModule> module) {
       std::move(llvm_context));
   cantFail((*jit)->AddModule(std::move(thread_safe_module)));
 
-  auto cpu_executable = std::make_unique<CpuExecutable>(
-      std::move(*jit), std::move(assignment), std::move(module), function_name,
-      std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map));
+  TF_ASSIGN_OR_RETURN(
+      auto cpu_executable,
+      CpuExecutable::Create(std::move(*jit), std::move(assignment),
+                            std::move(module), function_name,
+                            std::move(hlo_profile_printer_data),
+                            std::move(hlo_profile_index_map)));
 
   if (embed_ir_in_executable) {
     cpu_executable->set_ir_module_string(ir_module_string);
@@ -1507,7 +1510,7 @@ CpuCompiler::CompileXlaRuntimeCpuExecutable(
         obj_file);
   }
 
-  return std::make_unique<CpuExecutable>(
+  return CpuExecutable::Create(
       std::move(hlo_module), std::move(hlo_profile_printer_data),
       std::move(hlo_profile_index_map), std::move(assignment),
       std::move(xla_runtime_executable));
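TF_ASSIGN_OR_RETURN, used in the new code above, evaluates a StatusOr expression, early-returns the error status if it failed, and otherwise moves the value into the declared variable. Hand-written, it is roughly the following — a sketch with hypothetical ParsePort/Serve functions; the real macro additionally hides the temporary behind a uniquely named local:

```cpp
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"

// Hypothetical fallible step standing in for CpuExecutable::Create.
absl::StatusOr<int> ParsePort(bool ok) {
  if (!ok) return absl::InvalidArgumentError("bad port");
  return 8080;
}

absl::Status Serve(bool ok) {
  // Hand-written equivalent of: TF_ASSIGN_OR_RETURN(int port, ParsePort(ok));
  absl::StatusOr<int> port_or = ParsePort(ok);
  if (!port_or.ok()) return port_or.status();  // propagate the error upward
  int port = std::move(port_or).value();

  (void)port;  // ... use port ...
  return absl::OkStatus();
}
```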
61 changes: 34 additions & 27 deletions tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -58,53 +58,60 @@ namespace cpu {
 
 namespace runtime = ::xla::runtime;
 
-CpuExecutable::CpuExecutable(
+StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloModule> hlo_module,
     const std::string& entry_function_name,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
-                 std::move(hlo_profile_index_map)),
-      jit_(std::move(jit)),
-      assignment_(std::move(assignment)),
-      module_name_(entry_function_name) {
-  if (assignment_) {
-    buffer_assignment_ =
-        std::make_shared<BufferAssignmentProto>(assignment_->ToProto());
-  }
-  if (has_module()) {
-    XlaDebugInfoManager::Get()->RegisterModule(
-        module().unique_id(), shared_module(), buffer_assignment_);
-  }
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) {
+  std::unique_ptr<CpuExecutable> executable(new CpuExecutable(
+      std::move(hlo_module), std::move(hlo_profile_printer_data),
+      std::move(hlo_profile_index_map), std::move(assignment)));
+  executable->jit_ = std::move(jit);
+  executable->module_name_ = entry_function_name;
 
   // Resolve symbols in the constructor rather than at execution time to avoid
   // races because FindSymbol is not thread safe.
   llvm::Expected<llvm::orc::ExecutorSymbolDef> sym =
-      jit_->FindCompiledSymbol(entry_function_name);
+      executable->jit_->FindCompiledSymbol(entry_function_name);
   // We expect to find the symbol provided with entry_function_name; otherwise
   // this is an internal error.
-  CHECK(sym->getAddress()) << "Symbol " << entry_function_name << " not found.";
+  if (!sym) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Symbol ", entry_function_name, " not found."));
+  }
   // getAddress can do work under the hood in the jit, so it needs to be
   // guarded by the mutex.
-  compute_function_ =
+  executable->compute_function_ =
       reinterpret_cast<ComputeFunctionType>(sym->getAddress().getValue());
   VLOG(1) << "compute_function_ at address "
-          << reinterpret_cast<void*>(compute_function_);
-  jit_->DoneCompiling();
+          << reinterpret_cast<void*>(executable->compute_function_);
+  executable->jit_->DoneCompiling();
+  return executable;
 }
 
-CpuExecutable::CpuExecutable(
+StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
     std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<XlaRuntimeCpuExecutable> xla_runtime_executable)
+    std::unique_ptr<XlaRuntimeCpuExecutable> xla_runtime_executable) {
+  std::unique_ptr<CpuExecutable> executable(new CpuExecutable(
+      std::move(hlo_module), std::move(hlo_profile_printer_data),
+      std::move(hlo_profile_index_map), std::move(assignment)));
+  executable->xla_runtime_executable_ = std::move(xla_runtime_executable);
+  return executable;
+}
+
+CpuExecutable::CpuExecutable(
+    std::unique_ptr<HloModule> hlo_module,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
+    std::unique_ptr<const BufferAssignment> assignment)
     : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
-      assignment_(std::move(assignment)),
-      xla_runtime_executable_(std::move(xla_runtime_executable)) {
+      assignment_(std::move(assignment)) {
   if (assignment_) {
     buffer_assignment_ =
         std::make_shared<BufferAssignmentProto>(assignment_->ToProto());
@@ -328,9 +335,9 @@ StatusOr<std::unique_ptr<Executable>> CpuExecutable::LoadFromObjFile(
       std::move(executable_ptr), xla_framework_mapping,
       std::move(*ffi_modules_state));
 
-  return std::unique_ptr<Executable>(new CpuExecutable(
-      std::move(hlo_module), nullptr, nullptr, std::move(buffer_assignment),
-      std::move(xla_runtime_executable)));
+  return CpuExecutable::Create(std::move(hlo_module), nullptr, nullptr,
+                               std::move(buffer_assignment),
+                               std::move(xla_runtime_executable));
 }
 
 StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
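In the LoadFromObjFile hunk above, the function's return type is StatusOr&lt;std::unique_ptr&lt;Executable&gt;&gt; yet it returns CpuExecutable::Create(...) directly. That compiles because absl::StatusOr&lt;T&gt; converts from StatusOr&lt;U&gt; whenever T is constructible from U, as std::unique_ptr&lt;Executable&gt; is from std::unique_ptr&lt;CpuExecutable&gt;. A self-contained sketch with stand-in types, not the real XLA classes:

```cpp
#include <memory>

#include "absl/status/statusor.h"

// Stand-ins for the real XLA types.
struct Executable {
  virtual ~Executable() = default;
};

struct CpuExecutable : Executable {
  static absl::StatusOr<std::unique_ptr<CpuExecutable>> Create() {
    return std::unique_ptr<CpuExecutable>(new CpuExecutable());
  }
};

// StatusOr<unique_ptr<Derived>> converts to StatusOr<unique_ptr<Base>>,
// so the factory result can be returned as-is.
absl::StatusOr<std::unique_ptr<Executable>> Load() {
  return CpuExecutable::Create();
}
```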
23 changes: 14 additions & 9 deletions tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -138,14 +138,15 @@ class XlaRuntimeCpuExecutable {
 // architecture, so JIT-ed code and host code share the same ABI.
 class CpuExecutable : public Executable {
  public:
-  CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
-                std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<HloModule> hlo_module,
-                const std::string& entry_function_name,
-                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
-  // XLA Runtime constructor.
-  CpuExecutable(
+  static StatusOr<std::unique_ptr<CpuExecutable>> Create(
+      std::unique_ptr<SimpleOrcJIT> jit,
+      std::unique_ptr<const BufferAssignment> assignment,
+      std::unique_ptr<HloModule> hlo_module,
+      const std::string& entry_function_name,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
+  // XLA Runtime factory method.
+  static StatusOr<std::unique_ptr<CpuExecutable>> Create(
       std::unique_ptr<HloModule> hlo_module,
       std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
@@ -257,7 +258,7 @@ class CpuExecutable : public Executable {
   const InstructionValueSet& GetRootValueSet() const;
 
   // The JIT containing compiled modules.
-  const std::unique_ptr<SimpleOrcJIT> jit_;
+  std::unique_ptr<SimpleOrcJIT> jit_;
 
   // Buffer assignment for the buffers we need to allocate.
   const std::unique_ptr<const BufferAssignment> assignment_;
@@ -281,6 +282,10 @@
   // If not null, XLA Runtime is enabled.
   std::unique_ptr<XlaRuntimeCpuExecutable> xla_runtime_executable_;
 
+  CpuExecutable(std::unique_ptr<HloModule> hlo_module,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
+                std::unique_ptr<const BufferAssignment> assignment);
   CpuExecutable(const CpuExecutable&) = delete;
   CpuExecutable& operator=(const CpuExecutable&) = delete;
 };
2 changes: 1 addition & 1 deletion tensorflow/core/public/version.h
@@ -22,7 +22,7 @@ limitations under the License.
 // tensorflow/tools/pip_package/setup.py
 #define TF_MAJOR_VERSION 2
 #define TF_MINOR_VERSION 13
-#define TF_PATCH_VERSION 0
+#define TF_PATCH_VERSION 1
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
2 changes: 1 addition & 1 deletion tensorflow/tensorflow.bzl
@@ -71,7 +71,7 @@ def register_extension_info(**kwargs):
 # not contain rc or alpha, only numbers.
 # Also update tensorflow/core/public/version.h
 # and tensorflow/tools/pip_package/setup.py
-VERSION = "2.13.0"
+VERSION = "2.13.1"
 VERSION_MAJOR = VERSION.split(".")[0]
 two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"]
 
4 changes: 4 additions & 0 deletions tensorflow/tools/ci_build/build_scripts/ARM_SKIP_TESTS.sh
@@ -16,6 +16,10 @@
 set -x
 
 ARM_SKIP_TESTS="-//tensorflow/lite/... \
+-//tensorflow/core/platform:ram_file_system_test \
+-//tensorflow/python/compiler/xla:xla_test \
+-//tensorflow/python/data/experimental/kernel_tests:checkpoint_input_pipeline_hook_test \
+-//tensorflow/python/distribute:parameter_server_strategy_test \
 -//tensorflow/python/kernel_tests/nn_ops:atrous_conv2d_test \
 -//tensorflow/python/kernel_tests/nn_ops:conv_ops_test \
 "
4 changes: 2 additions & 2 deletions tensorflow/tools/pip_package/setup.py
@@ -47,8 +47,8 @@
 # result for pip.
 # Also update tensorflow/tensorflow.bzl and
 # tensorflow/core/public/version.h
-_VERSION = '2.13.0'
-_RC_VERSION = ''
+_VERSION = '2.13.1'
+
 
 # We use the same setup.py for all tensorflow_* packages and for the nightly
 # equivalents (tf_nightly_*). The package is controlled from the argument line
8 changes: 7 additions & 1 deletion tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile
@@ -34,12 +34,18 @@ RUN /setup.sources.sh && /setup.packages.sh /devel.packages.txt && /setup.cuda.s
 # - buildifier: clean bazel build deps
 # - buildozer: clean bazel build deps
 # - gcloud SDK: communicate with Google Cloud Platform (GCP) for RBE, CI
+# - patchelf: Utility tool to modify existing ELF executables and libraries
 RUN git clone --branch v1.7.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
 RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
 RUN wget https://github.com/bazelbuild/buildtools/releases/download/3.5.0/buildifier -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
 RUN wget https://github.com/bazelbuild/buildtools/releases/download/3.5.0/buildozer -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
 RUN curl -sSL https://sdk.cloud.google.com > /tmp/gcloud && bash /tmp/gcloud --install-dir=~/usr/local/bin --disable-prompts
 
+# Download and install patchelf v0.18.0 from GitHub. The default Ubuntu focal
+# packages only provide the "0.10-2build1" version. We use patchelf to manipulate
+# certain shared libraries during the wheel building process (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/build_pip_package.sh#L255-L262).
+# When we use Patchelf versions <0.12, those shared libraries end up with a
+# corrupted PT_NOTE program header. This was fixed in v0.12, see https://github.com/NixOS/patchelf/commit/43a33482b501b0f5ee9da312aabfca3806570cc9.
+RUN wget https://github.com/NixOS/patchelf/releases/download/0.18.0/patchelf-0.18.0-x86_64.tar.gz && tar -zxvf patchelf-0.18.0-x86_64.tar.gz -C /usr && rm -rf patchelf-0.18.0-x86_64.tar.gz
 
 # All lines past this point are reset when $CACHEBUSTER is set. We need this
 # for Python specifically because we install some nightly packages which are
@@ -28,6 +28,7 @@ build-essential
 ca-certificates
 llvm-16
 clang-16
+lld-16
 clang-format-12
 colordiff
 curl
@@ -47,7 +48,6 @@ mlocate
 moreutils
 openjdk-11-jdk
 openjdk-11-jre-headless
-patchelf
 pkg-config
 python3-dev
 python3-setuptools
@@ -23,6 +23,10 @@ build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
 # Target the AVX instruction set
 build --copt=-mavx --host_copt=-mavx
 
+# Use lld as the linker
+build --linkopt="-fuse-ld=lld"
+build --linkopt="-lm"
+
 # Disable clang extention that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within
@@ -30,6 +30,10 @@ build --copt=-mavx --host_copt=-mavx
 # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
 build --copt=-Wno-gnu-offsetof-extensions
 
+# Use lld as the linker
+build --linkopt="-fuse-ld=lld"
+build --linkopt="-lm"
+
 # Store performance profiling log in the mounted artifact directory.
 # The profile can be viewed by visiting chrome://tracing in a Chrome browser.
 # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling
