Skip to content

Commit

Permalink
Merge pull request #29 from shizunge/parallel
Browse files Browse the repository at this point in the history
Add `GANTRY_UPDATE_NUM_WORKERS` and other fixes
  • Loading branch information
shizunge authored Feb 18, 2024
2 parents fa0d4fb + c1a21ba commit 24fe8dd
Show file tree
Hide file tree
Showing 27 changed files with 1,548 additions and 884 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ jobs:
- name: Upload coverage reports to Codecov
if: ${{ github.ref == 'refs/heads/main' }}
uses: codecov/codecov-action@v4
with:
directory: coverage
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

12 changes: 6 additions & 6 deletions .github/workflows/on-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ jobs:
matrix:
test_suit:
- gantry_cleanup_images_spec.sh
- gantry_entrypoint_spec.sh
- gantry_common_options_spec.sh
- gantry_filters_spec.sh
- gantry_job_spec.sh
- gantry_login_spec.sh
- gantry_manifest_spec.sh
- gantry_multiple_services_spec.sh
- gantry_no_running_tasks_spec.sh
- gantry_notify_spec.sh
- gantry_options_spec.sh
- gantry_parallel_spec.sh
- gantry_rollback_spec.sh
- gantry_simple_spec.sh
- gantry_service_multiple_spec.sh
- gantry_service_no_running_tasks_spec.sh
- gantry_service_single_spec.sh
- gantry_update_options_spec.sh
steps:
- name: Set up Docker Buildx
uses: docker/[email protected]
Expand Down
59 changes: 52 additions & 7 deletions .github/workflows/on-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,24 @@ jobs:
set +e
tests:
name: Run tests
name: Test script
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
test_suit:
- gantry_cleanup_images_spec.sh
- gantry_entrypoint_spec.sh
- gantry_common_options_spec.sh
- gantry_filters_spec.sh
- gantry_job_spec.sh
- gantry_login_spec.sh
- gantry_manifest_spec.sh
- gantry_multiple_services_spec.sh
- gantry_no_running_tasks_spec.sh
- gantry_notify_spec.sh
- gantry_options_spec.sh
- gantry_parallel_spec.sh
- gantry_rollback_spec.sh
- gantry_simple_spec.sh
- gantry_service_multiple_spec.sh
- gantry_service_no_running_tasks_spec.sh
- gantry_service_single_spec.sh
- gantry_update_options_spec.sh
steps:
- name: Set up Docker Buildx
uses: docker/[email protected]
Expand Down Expand Up @@ -111,4 +111,49 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
provenance: false
- name: Extract tag
run: |
TAGS="${{ steps.meta.outputs.tags }}"
for TAG in ${TAGS}; do
if echo "${TAG}" | grep -q "ghcr.io/${{ github.repository }}-development:dev-"; then
echo "TAG=${TAG}"
echo "${TAG}" > tag.txt
break;
fi
done
- name: Store tag
uses: actions/upload-artifact@v4
with:
name: tag
path: tag.txt


container_tests:
name: Test container
runs-on: ubuntu-latest
needs:
- build_and_push
steps:
- name: Set up Docker Buildx
uses: docker/[email protected]
- name: Install shellspec
run: |
mkdir -p ~/shellspec
cd ~/shellspec
git clone https://github.com/shellspec/shellspec.git
ln -s ~/shellspec/shellspec/shellspec /usr/local/bin/shellspec
echo -n "shellspec version: "
shellspec --version
- name: Checkout Code
uses: actions/checkout@v4
- name: Load tag
uses: actions/download-artifact@v4
with:
name: tag
- name: Run tests
run: |
export DOCKERHUB_PASSWORD=${{ secrets.DOCKERHUB_PASSWORD }}
export DOCKERHUB_USERNAME=${{ secrets.DOCKERHUB_USERNAME }}
export GANTRY_TEST_CONTAINER_REPO_TAG=$(cat tag.txt)
echo "GANTRY_TEST_CONTAINER_REPO_TAG=${GANTRY_TEST_CONTAINER_REPO_TAG}"
bash shellspec --jobs 50 --tag "container_test:true"
18 changes: 1 addition & 17 deletions .github/workflows/on-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,6 @@ jobs:
tests:
name: Run tests
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
test_suit:
- gantry_cleanup_images_spec.sh
- gantry_entrypoint_spec.sh
- gantry_filters_spec.sh
- gantry_job_spec.sh
- gantry_login_spec.sh
- gantry_manifest_spec.sh
- gantry_multiple_services_spec.sh
- gantry_no_running_tasks_spec.sh
- gantry_notify_spec.sh
- gantry_options_spec.sh
- gantry_rollback_spec.sh
- gantry_simple_spec.sh
steps:
- name: Set up Docker Buildx
uses: docker/[email protected]
Expand All @@ -60,7 +44,7 @@ jobs:
run: |
export DOCKERHUB_PASSWORD=${{ secrets.DOCKERHUB_PASSWORD }}
export DOCKERHUB_USERNAME=${{ secrets.DOCKERHUB_USERNAME }}
bash shellspec --pattern tests/${{ matrix.test_suit }}
bash shellspec --jobs 50
build_and_push:
name: Build and push Docker image
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.shellspec-quick.log
coverage
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ You can configure the most behaviors of *Gantry* via environment variables.
| GANTRY_ROLLBACK_ON_FAILURE | true | Set to `true` to enable rollback when updating fails. Set to `false` to disable the rollback. |
| GANTRY_ROLLBACK_OPTIONS | | [Options](https://docs.docker.com/engine/reference/commandline/service_update/#options) added to the `docker service update --rollback` command. |
| GANTRY_UPDATE_JOBS | false | Set to `true` to update replicated-job or global-job. Set to `false` to disable updating jobs. |
| GANTRY_UPDATE_NUM_WORKERS | 1 | The maximum number of updates that can run in parallel. |
| GANTRY_UPDATE_OPTIONS | | [Options](https://docs.docker.com/engine/reference/commandline/service_update/#options) added to the `docker service update` command. |
| GANTRY_UPDATE_TIMEOUT_SECONDS | 300 | Error out if updating of a single service takes longer than the given time. |

Expand Down
1 change: 1 addition & 0 deletions docs/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ The label on the services to select config to enable authentication is renamed t
| GANTRY_SERVICES_EXCLUDED_FILTERS |
| GANTRY_SERVICES_SELF |
| GANTRY_UPDATE_JOBS |
| GANTRY_UPDATE_NUM_WORKERS |

### License

Expand Down
19 changes: 9 additions & 10 deletions src/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ _read_docker_hub_rate() {
USER=
fi
if ! HOST=$(gantry_read_registry_host 2>&1); then
log ERROR "Failed to read HOST: ${HOST}";
log ERROR "Failed to read registry HOST: ${HOST}";
HOST=
fi
local USER_AND_PASS=
Expand All @@ -77,7 +77,9 @@ _read_docker_hub_rate() {
gantry() {
local PRE_RUN_CMD="${GANTRY_PRE_RUN_CMD:-""}"
local POST_RUN_CMD="${GANTRY_POST_RUN_CMD:-""}"
local STACK="${1:-gantry}"
local STACK="${1}"
[ -z "${STACK}" ] && STACK=$(gantry_current_service_name)
[ -z "${STACK}" ] && STACK="gantry"
export LOG_SCOPE="${STACK}"
local START_TIME=
START_TIME=$(date +%s)
Expand Down Expand Up @@ -115,7 +117,7 @@ gantry() {
gantry_update_services_list "${SERVICES_LIST}"
ACCUMULATED_ERRORS=$((ACCUMULATED_ERRORS + $?))
else
log WARN "Skip updating all services due to previous errors."
log WARN "Skip updating all services due to previous error(s)."
fi

local DOCKER_HUB_RATE_AFTER=
Expand All @@ -131,9 +133,9 @@ gantry() {

local TIME_ELAPSED=
TIME_ELAPSED=$(time_elapsed_since "${START_TIME}")
local MESSAGE="Done. Use ${TIME_ELAPSED}. ${ACCUMULATED_ERRORS} errors."
local MESSAGE="Done. Use ${TIME_ELAPSED}. ${ACCUMULATED_ERRORS} error(s)."
local RETURN_VALUE=0
if [ ${ACCUMULATED_ERRORS} -gt 0 ]; then
if [ "${ACCUMULATED_ERRORS}" -gt 0 ]; then
log ERROR "${MESSAGE}"
RETURN_VALUE=1
else
Expand All @@ -147,12 +149,9 @@ main() {
LOG_LEVEL="${GANTRY_LOG_LEVEL:-${LOG_LEVEL}}"
NODE_NAME="${GANTRY_NODE_NAME:-${NODE_NAME}}"
export LOG_LEVEL NODE_NAME
local INTERVAL_SECONDS=
INTERVAL_SECONDS=$(gantry_read_number GANTRY_SLEEP_SECONDS 0) || return 1
local IMAGES_TO_REMOVE="${GANTRY_IMAGES_TO_REMOVE:-""}"
local INTERVAL_SECONDS="${GANTRY_SLEEP_SECONDS:-0}"
if ! is_number "${INTERVAL_SECONDS}"; then
log ERROR "GANTRY_SLEEP_SECONDS must be a number. Got \"${GANTRY_SLEEP_SECONDS}\"."
return 1;
fi
if [ -n "${IMAGES_TO_REMOVE}" ]; then
# Image remover runs as a global job. The log will be collected via docker commands then formatted.
# Redefine the log function for the formatter.
Expand Down
79 changes: 40 additions & 39 deletions src/lib-common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ log() {
if _log_level "${1}" >/dev/null; then
LEVEL="${1}";
shift;
fi;
fi
_log_formatter "${LEVEL}" "$(date -Iseconds)" "${NODE_NAME}" "${LOG_SCOPE}" "${@}";
}

Expand Down Expand Up @@ -329,50 +329,45 @@ wait_service_state() {
local WAIT_RUNNING WAIT_COMPLETE;
WAIT_RUNNING=$(echo "${@}" | grep -q -- "--running" && echo "true" || echo "false")
WAIT_COMPLETE=$(echo "${@}" | grep -q -- "--complete" && echo "true" || echo "false")
local RETURN_VALUE=0
local SLEEP_SECONDS=1
local STATES=
STATES=$(_docker_service_task_states "${SERVICE_NAME}" 2>&1)
while is_true "${WAIT_RUNNING}" || is_true "${WAIT_COMPLETE}" ; do
while STATES=$(_docker_service_task_states "${SERVICE_NAME}" 2>&1); do
local NUM_LINES=0
local NUM_RUNS=0
local NUM_DONES=0
local NUM_FAILS=0
while read -r LINE; do
[ -z "${LINE}" ] && continue;
log INFO "Service ${SERVICE_NAME}: ${LINE}."
NUM_LINES=$((NUM_LINES+1));
echo "${LINE}" | grep -q "Running" && NUM_RUNS=$((NUM_RUNS+1));
echo "${LINE}" | grep -q "Complete" && NUM_DONES=$((NUM_DONES+1));
echo "${LINE}" | grep -q "Failed" && NUM_FAILS=$((NUM_FAILS+1));
done < <(echo "${STATES}")
if [ ${NUM_LINES} -gt 0 ]; then
if ${WAIT_RUNNING} && [ ${NUM_RUNS} -eq ${NUM_LINES} ]; then
break
if [ "${NUM_LINES}" -gt 0 ]; then
if "${WAIT_RUNNING}" && [ "${NUM_RUNS}" -eq "${NUM_LINES}" ]; then
return 0;
fi
if ${WAIT_COMPLETE} && [ ${NUM_DONES} -eq ${NUM_LINES} ]; then
break
if "${WAIT_COMPLETE}" && [ "${NUM_DONES}" -eq "${NUM_LINES}" ]; then
return 0;
fi
if ${WAIT_COMPLETE} && [ ${NUM_FAILS} -gt 0 ]; then
if "${WAIT_COMPLETE}" && [ "${NUM_FAILS}" -gt 0 ]; then
# Get return value of the task from the string "task: non-zero exit (1)".
local TASK_STATE=
local TASK_RETURN_VALUE=
TASK_STATE=$(echo "${STATES}" | grep "Failed")
TASK_RETURN_VALUE=$(echo "${TASK_STATE}" | sed -n 's/.*task: non-zero exit (\([0-9]\+\)).*/\1/p')
TASK_RETURN_VALUE=$(echo "${STATES}" | grep "Failed" | sed -n 's/.*task: non-zero exit (\([0-9]\+\)).*/\1/p')
# Get the first error code.
local RETURN_VALUE=0
RETURN_VALUE=$(echo "${TASK_RETURN_VALUE:-1}" | cut -d ' ' -f 1)
break
return "${RETURN_VALUE}"
fi
fi
sleep "${SLEEP_SECONDS}"
if ! STATES=$(_docker_service_task_states "${SERVICE_NAME}" 2>&1); then
log ERROR "Failed to obtain task states of service ${SERVICE_NAME}: ${STATES}"
return 1
if ! ("${WAIT_RUNNING}" || "${WAIT_COMPLETE}"); then
return 0;
fi
sleep "${SLEEP_SECONDS}"
done
echo "${STATES}" | while read -r LINE; do
log INFO "Service ${SERVICE_NAME}: ${LINE}."
done
return "${RETURN_VALUE}"
log ERROR "Failed to obtain task states of service ${SERVICE_NAME}: ${STATES}"
return 1
}

docker_service_remove() {
Expand All @@ -381,22 +376,29 @@ docker_service_remove() {
return 0
fi
log INFO "Removing service ${SERVICE_NAME}."
docker service rm "${SERVICE_NAME}" >/dev/null
local RETURN_VALUE=$?
local LOG=
if ! LOG=$(docker service rm "${SERVICE_NAME}" 2>&1); then
log ERROR "Failed to remove docker service ${SERVICE_NAME}: ${LOG}"
return 1
fi
log INFO "Removed service ${SERVICE_NAME}."
return ${RETURN_VALUE}
return 0
}

# We do not expect failures when using docker_global_job.
# Docker will try to restart the failed tasks.
# We do not check the convergence of the service. It must be used together with wait_service_state.
# We do not check the convergence of the service, thus some jobs may fail on some nodes.
# It is better to use it together with wait_service_state.
docker_global_job() {
local SERVICE_NAME=
SERVICE_NAME=$(_get_docker_command_name_arg "${@}")
log INFO "Starting service ${SERVICE_NAME}."
docker service create \
--mode global-job \
"${@}" >/dev/null
log INFO "Starting global-job ${SERVICE_NAME}."
local LOG=
if ! LOG=$(docker service create --mode global-job "${@}" 2>&1); then
log ERROR "Failed to create global-job ${SERVICE_NAME}: ${LOG}"
return 1
fi
return 0
}

# A job could fail when using docker_replicated_job.
Expand All @@ -407,16 +409,17 @@ docker_replicated_job() {
IS_DETACH=$(_get_docker_command_detach "${@}")
# Add "--detach" to work around https://github.com/docker/cli/issues/2979
# The Docker CLI does not exit on failures.
log INFO "Starting service ${SERVICE_NAME}."
docker service create \
--mode replicated-job --detach \
"${@}" >/dev/null
local RETURN_VALUE=$?
log INFO "Starting replicated-job ${SERVICE_NAME}."
local LOG=
if ! LOG=$(docker service create --mode replicated-job --detach "${@}" 2>&1); then
log ERROR "Failed to create replicated-job ${SERVICE_NAME}: ${LOG}"
return 1
fi
# If the command line does not contain '--detach', the function does not return until the replicated job is complete.
if ! "${IS_DETACH}"; then
wait_service_state "${SERVICE_NAME}" --complete || return $?
fi
return ${RETURN_VALUE}
return 0
}

_container_status() {
Expand All @@ -442,9 +445,7 @@ docker_run() {
local RETRIES=0
local MAX_RETRIES=5
local SLEEP_SECONDS=10
while ! docker run \
"${@}" >/dev/null;
do
while ! docker run "${@}" >/dev/null; do
if [ ${RETRIES} -ge ${MAX_RETRIES} ]; then
echo "Failed to run docker. Reached the max retries ${MAX_RETRIES}." >&2
return 1
Expand Down
Loading

0 comments on commit 24fe8dd

Please sign in to comment.