[testing-on-gke part 6.10] Add script for automating runs #2538

Merged
125 changes: 125 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/run-automated.sh
@@ -0,0 +1,125 @@
#!/bin/bash
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script automates the execution of run-gke-tests.sh so that it can be
# run periodically as a cron job.
#
# Add/remove/modify the configuration parameters below to suit your use case.
#
# Assumptions for this script to work:
# 1. You have appropriate access to the project_id defined below.
# 2. You either have a cluster named cluster_name in project_id, or enough
#    resources in project_id to create that cluster.
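#
# For illustration only (all values below are hypothetical placeholders), a
# crontab entry that runs this script nightly at 02:00 could look like the
# following, all on a single line (cron may not set USER, so it is passed
# explicitly here):
#   0 2 * * * USER=jdoe project_id=my-gcp-project project_number=123456789 cluster_name=my-gke-cluster gcsfuse_branch=master output_gsheet_id=my-gsheet-id output_gsheet_keyfile=my-keyfile.json output_bucket=my-output-bucket bash /path/to/run-automated.sh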

# Print all shell commands.
set -x

# Fail if any command fails.
set -e

# Environment variable USER must be defined.
if test -z "${USER}"; then
  echo "USER has not been set"
  exit 1
fi

# Define configuration parameters.
if test -z "${project_id}"; then
  echo "project_id has not been set."
  exit 1
fi
if test -z "${project_number}"; then
  echo "project_number has not been set."
  exit 1
fi
export zone=us-west1-b
if test -z "${cluster_name}"; then
echo "cluster_name has not been set."
exit 1
fi
export node_pool=default-pool
export machine_type=n2-standard-96
export num_nodes=7
export num_ssd=16
export use_custom_csi_driver=true
export output_dir=.
if test -z "${gcsfuse_branch}"; then
echo "gcsfuse_branch has not been set."
exit 1
fi
export pod_wait_time_in_seconds=300
export pod_timeout_in_seconds=64800
# Pass instance_id from outside to continue a previous run, if it was
# terminated somehow (e.g. an ssh timeout).
if test -z "${instance_id}"; then
  export instance_id=$(echo ${USER} | sed 's/_google//' | sed 's/_com//')-$(date +%Y%m%d-%H%M%S)
fi
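# For illustration: with USER=jdoe_google_com (hypothetical), the two sed
# filters above strip "_google" and "_com", so a run started at
# 2024-08-01 10:00:00 would get instance_id=jdoe-20240801-100000.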
if test -z "${output_gsheet_id}"; then
echo "output_gsheet_id has not been set."
exit 1
fi
if test -z "${output_gsheet_keyfile}"; then
echo "output_gsheet_keyfile has not been set."
exit 1
fi
export force_update_gcsfuse_code=true
# Continue a previous run if its pods had already been scheduled/completed.
test -n "${only_parse}" || export only_parse=false

# Create a dedicated folder on the machine.
mkdir -pv ~/gke-testing && cd ~/gke-testing
wget https://raw.githubusercontent.com/googlecloudplatform/gcsfuse/${gcsfuse_branch}/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh -O run-gke-tests.sh
chmod +x run-gke-tests.sh

# Remove previous run's outputs.
rm -rfv log fio/output.csv dlio/output.csv

# Run the script.
start_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run started at ${start_time}"
touch log
(./run-gke-tests.sh --debug |& tee -a log) || true
# Use the following if you want to run it in a tmux session instead.
# tmux new-session -d -s ${instance_id} 'bash -c "(./run-gke-tests.sh --debug |& tee -a log); sleep 604800 "'
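# In that case the session is named after ${instance_id}, and it can be
# re-attached later with, for example: tmux attach-session -t ${instance_id}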
end_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run ended at ${end_time}"

# Post-run steps for collecting outputs.
if test -n "${workload_config}"; then
  cp "${workload_config}" ./workloads.json
else
  cp src/gcsfuse/perfmetrics/scripts/testing_on_gke/examples/workloads.json .
fi
git -C src/gcsfuse rev-parse HEAD > gcsfuse_commithash
git -C src/gcs-fuse-csi-driver rev-parse HEAD > gcs_fuse_csi_driver_commithash
# Fetch cloud-logs for this run. This has not been tested yet.
# (gcloud logging read --project=${project_id} "timestamp>=\"${start_time}\" AND timestamp<=\"${end_time}\" AND resource.labels.cluster_name=\"${cluster_name}\"" --order=ASC --format=csv\(timestamp\,resource.labels.pod_name,resource.labels.container_name,"text_payload"\) > cloud_logs.txt) &

# Upload outputs to GCS after the run.
if test -z "${output_bucket}"; then
echo "output_bucket has not been set."
exit 1
fi
output_path_uri=gs://${output_bucket}/outputs/${instance_id}
for file in fio/output.csv dlio/output.csv log run-gke-tests.sh workloads.json gcsfuse_commithash gcs_fuse_csi_driver_commithash; do
  if test -f "${file}"; then
    gcloud storage cp --content-type=text/plain "${file}" "${output_path_uri}/${file}"
  fi
done
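# For illustration, with output_bucket=my-output-bucket (hypothetical) and the
# instance_id generated above, the run log would be uploaded to
# gs://my-output-bucket/outputs/jdoe-20240801-100000/log .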

# Return to the working directory you were in before.
cd -
@@ -3,7 +3,7 @@
"TestConfig": {
"workloadConfig": {
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
"runOnSSD": true,
"runOnSSD": false,
"workloads": [
{
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
59 changes: 59 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/workloads_test.json
@@ -0,0 +1,59 @@
{
  "_comment": "A leading _ at the start of an element's name marks that element as a comment.",
  "TestConfig": {
    "workloadConfig": {
      "_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
      "runOnSSD": false,
      "workloads": [
        {
          "_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
          "_fioWorkload": {
            "_description": "Every fioWorkload must have fileSize, filesPerThread, numThreads, and blockSize fields. readTypes is an array of string values 'read' and 'randread'. If readTypes is missing, then it defaults to [\"read\",\"randread\"].",
            "fileSize": "64K",
            "filesPerThread": 20000,
            "numThreads": 50,
            "blockSize": "64K",
            "readTypes": ["read","randread"]
          },
          "gcsfuseMountOptions": "GCSFuse mount-options, in a compact stringified format, to be used for the test scenario gcsfuse-generic. The individual config/cli flag values should be separated by commas. Each cli flag should be of the form <flag>[=<value>], while each config-file flag should be of the form <config>[:<subconfig>[:<subsubconfig>[...]]]:<value>. For example, a legal value would be: implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:-1 .",
          "bucket": "The bucket must have objects with name Workload.{i}/{j} for every i,j where i:0-{numThreads}-1, j:0-{filesPerThread}-1, and each of these objects must be of size {fileSize}. The buckets gke-* are all in us-central1, are owned by the GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as the GKE cluster used for running this test configuration."
        },
        {
          "fioWorkload": {
            "fileSize": "64K",
            "filesPerThread": 100,
            "numThreads": 20,
            "blockSize": "64K",
            "readTypes": ["randread"]
          },
          "gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
          "bucket": "fio-64k-1m-us-west1",
          "_bucket_alt2": "fio-64k-1m-us-central1",
          "_bucket_alt3": "gke-fio-64k-1m"
        },
        {
          "_description": "This is a dummy dlio workload (missing the 'dlioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a dlio workload, it must have a valid 'dlioWorkload' object, a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
          "_dlioWorkload": {
            "_description": "Every dlioWorkload must have numFilesTrain, recordLength, and batchSizes fields. batchSizes is an array of integer values.",
            "numFilesTrain": 500000,
            "recordLength": 102400,
            "batchSizes": [800,128]
          },
          "gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true",
          "bucket": "The bucket must have objects with name 'train/', 'valid/', and train/img_{i}_of_{numFilesTrain}.npz for every i where i:0-{numFilesTrain}-1, and each train/img_{i}_of_{numFilesTrain}.npz must be of size {recordLength} bytes. The buckets gke-* are all in us-central1, are owned by the GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as the GKE cluster used for running this test configuration."
        },
        {
          "dlioWorkload": {
            "numFilesTrain": 1000,
            "recordLength": 3145728,
            "batchSizes": [200]
          },
          "gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
          "bucket": "dlio-unet3d-3mb-100k-us-west1",
"_bucket_alt2":"dlio-unet3d-3mb-100k-us-central1",
"_bucket_alt3":"gke-dlio-unet3d-3mb-100k"
}
]
}
}
}
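As a usage sketch (not part of this PR; the values below are hypothetical placeholders, and it assumes run-gke-tests.sh reads workload_config from the environment, which the post-run copy step in run-automated.sh suggests), the automated run could be pointed at this test config from a gcsfuse checkout like so:

  export project_id=my-gcp-project project_number=123456789
  export cluster_name=my-gke-cluster gcsfuse_branch=master
  export output_gsheet_id=my-gsheet-id output_gsheet_keyfile=my-keyfile.json
  export output_bucket=my-output-bucket
  export workload_config=$(pwd)/perfmetrics/scripts/testing_on_gke/examples/workloads_test.json
  bash perfmetrics/scripts/testing_on_gke/examples/run-automated.sh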