From 82fc5ef63f6f4d9b31a469a194be8074bd6e2a36 Mon Sep 17 00:00:00 2001
From: scxfjiang
Date: Thu, 28 Nov 2024 16:33:49 -0600
Subject: [PATCH] script to run single test

---
 .../linux/rocm/run_gpu_single_single_test.sh  | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100755 tensorflow/tools/ci_build/linux/rocm/run_gpu_single_single_test.sh

diff --git a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single_single_test.sh b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single_single_test.sh
new file mode 100755
index 00000000000000..cdc71cb731ec43
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single_single_test.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+# Runs a single TensorFlow unit test on a ROCm GPU machine.
+#
+# Usage: run_gpu_single_single_test.sh [ROCM_INSTALL_DIR]
+#   ROCM_INSTALL_DIR - optional ROCm installation path; falls back to
+#                      $ROCM_PATH, then /opt/rocm-6.2.0.
+set -e
+set -x
+
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+# If rocm-smi exists locally (it should) use it to find
+# out how many GPUs we have to test with.
+# NOTE: probe inside the 'if' condition so that a failing rocm-smi does
+# not abort the whole script under 'set -e'; checking $? on a separate
+# line is unreachable when the command fails.
+if rocm-smi -i; then
+  TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep -c 'GPU')
+else
+  TF_GPU_COUNT=1
+fi
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(( TF_GPU_COUNT * TF_TESTS_PER_GPU ))
+
+echo ""
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+echo ""
+
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+if [[ -n $1 ]]; then
+  ROCM_INSTALL_DIR=$1
+else
+  if [[ -z "${ROCM_PATH}" ]]; then
+    ROCM_INSTALL_DIR=/opt/rocm-6.2.0
+  else
+    ROCM_INSTALL_DIR=$ROCM_PATH
+  fi
+fi
+
+# Run configure.
+export PYTHON_BIN_PATH=$(which python3)
+
+export TF_NEED_ROCM=1
+export ROCM_PATH=$ROCM_INSTALL_DIR
+
+if [ -f /usertools/rocm.bazelrc ]; then
+  # Use the bazelrc files in /usertools if available
+  if [ ! -d /tf ]; then
+    # The bazelrc files in /usertools expect /tf to exist
+    mkdir /tf
+  fi
+  bazel \
+    --bazelrc=/usertools/rocm.bazelrc \
+    test \
+    --jobs=30 \
+    --local_ram_resources=60000 \
+    --local_cpu_resources=15 \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --config=sigbuild_local_cache \
+    --config=rocm \
+    --action_env=OPENBLAS_CORETYPE=Haswell \
+    --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --config=pycpp_filters -- //tensorflow/compiler/tests:tridiagonal_matmul_ops_test
+else
+  # Legacy style: run configure then build
+  yes "" | $PYTHON_BIN_PATH configure.py
+
+  # Run bazel test command. Double test timeouts to avoid flakes.
+  bazel test \
+    --config=rocm \
+    -k \
+    --test_tag_filters=gpu,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-tpu,-v1only \
+    --jobs=30 \
+    --local_ram_resources=60000 \
+    --local_cpu_resources=15 \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+    --test_env=HSA_TOOLS_LIB=libroctracer64.so \
+    --action_env=OPENBLAS_CORETYPE=Haswell \
+    --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
+    --test_timeout 920,2400,7200,9600 \
+    --build_tests_only \
+    --test_output=errors \
+    --test_sharding_strategy=disabled \
+    --test_size_filters=small,medium,large \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
+    -- \
+    //tensorflow/... \
+    -//tensorflow/python/integration_testing/... \
+    -//tensorflow/core/tpu/... \
+    -//tensorflow/lite/... \
+    -//tensorflow/compiler/tf2tensorrt/... \
+    -//tensorflow/dtensor/python/tests:multi_client_test_nccl_2gpus
fi