openqa-investigate

#!/bin/bash

# Usage
# openqa-investigate <job_id|job_url> [<additional clone setting>...]
# echo jobnumber | openqa-investigate-multi

# http://redsymbol.net/articles/unofficial-bash-strict-mode/
set -euo pipefail

# shellcheck source=/dev/null
. "$(dirname "${BASH_SOURCE[0]}")"/_common

host="${host:-"openqa.opensuse.org"}"
scheme="${scheme:-"https"}"
host_url="$scheme://$host"
dry_run="${dry_run:-"0"}"
verbose="${verbose:-"false"}"
prio_add="${prio_add:-"100"}"
exclude_name_regex="${exclude_name_regex:-":investigate:"}"
exclude_no_group="${exclude_no_group:-"true"}"
# exclude_group_regex checks a combined string "<parent job group name> / <job group name>"
exclude_group_regex="${exclude_group_regex:-"Development.*/ "}"
force=${force:-false}
retries=${retries:-"3"}
OPENQA_CLI_RETRY_SLEEP_TIME_S=${OPENQA_CLI_RETRY_SLEEP_TIME_S:-20}
client_args=(api --header 'User-Agent: openqa-investigate (https://github.com/os-autoinst/scripts)' --host "$host_url" --retries="$retries")
jq_output_limit="${jq_output_limit:-15}"
curl_args=(-L --user-agent "openqa-investigate")
echoerr() { echo "$@" >&2; }

# Clone job $2 as an investigation job on behalf of job $1 and print a
# markdown list entry ("* *<name>*: t#<clone id>") for it.
#
# Arguments:
#   $1    origin: ID of the job under investigation
#   $2    id: ID of the job that is cloned (the origin itself or another job,
#         e.g. its "last good" job)
#   $3    optional suffix appended to the ":investigate" test name
#   $4    optional git refspec; if non-empty CASEDIR is pinned to that refspec
#   $5..  optional additional settings handed to the clone call
# Globals read: job_data (only if origin == id), host_url, clone_call,
#   dry_run, prio_add
# Returns:
#   1 if the job data could not be queried, 2 if the job is part of a directly
#   chained cluster, the status of a failing helper call, otherwise the status
#   of the last priority update. An unusable TEST_GIT_URL only emits a warning
#   and returns 0 without cloning.
clone() {
    local origin id name_suffix refspec unsupported_cluster_jobs name base_prio casedir repo out clone_id
    local clone_job_data vars_json testgiturl worker_vars_settings cloned_job_id
    local -a clone_settings clone_ids
    origin=${1:?"Need 'origin'"}
    id=${2:?"Need 'id'"}
    name_suffix=${3+":$3"}
    refspec=${4+$4}
    if [[ "$origin" == "$id" ]]; then
        # the data of the origin job was already fetched by the caller
        clone_job_data=$job_data
    elif ! clone_job_data=$(client-get-job "$id"); then
        echoerr "unable to query job data for $id: $clone_job_data"
        return 1
    fi

    # fail on jobs with directly chained dependencies (not supported)
    unsupported_cluster_jobs=$(echo "$clone_job_data" | runjq -r '(.job.children["Directly chained"] | length) + (.job.parents["Directly chained"] | length)') || return $?
    if [[ $unsupported_cluster_jobs != 0 ]]; then
        echoerr "Unable to clone job $id: it is part of a directly chained cluster (not supported)"
        return 2
    fi

    name="$(echo "$clone_job_data" | runjq -r '.job.test'):investigate$name_suffix" || return $?
    base_prio=$(echo "$clone_job_data" | runjq -r '.job.priority') || return $?
    clone_settings=("TEST+=:investigate$name_suffix" '_TRIGGER_JOB_DONE_HOOK=1' '_GROUP_ID=0' 'BUILD=')
    if [[ $refspec ]]; then
        vars_json=$(fetch-vars-json "$origin") || return $?
        testgiturl=$(echo "$vars_json" | runjq -r '.TEST_GIT_URL') || return $?
        casedir=$(echo "$clone_job_data" | runjq -r '.job.settings.CASEDIR') || return $?
        if [[ $testgiturl != null ]]; then
            if [[ ! $testgiturl =~ ^https?://[^[:space:]]+$ ]]; then
                warn "Can not clone refspec of job $origin with unknown/invalid git url TEST_GIT_URL='$testgiturl'"
                return 0
            fi
            # TEST_GIT_URL from vars.json takes precedence over the CASEDIR setting
            casedir=$testgiturl
        fi
        [[ $casedir == null ]] && casedir=''
        repo=${casedir:-'https://github.com/os-autoinst/os-autoinst-distri-opensuse.git'}
        # replace a refspec that is possibly already part of the URL
        clone_settings+=("CASEDIR=${repo%#*}#${refspec}")
        if [[ $name_suffix =~ (last_good_tests_and_build) ]]; then
            worker_vars_settings=$(echo "$vars_json" | runjq -r '.WORKER_CLASS') || return $?
            if [[ $worker_vars_settings != null ]]; then
                clone_settings+=("WORKER_CLASS=${worker_vars_settings}")
            else
                name+="(unidentified worker class in vars.json)"
            fi
        fi
    fi
    if [[ -n ${*:5} ]]; then
        clone_settings+=("${@:5}")
    fi
    # clear "PUBLISH_" settings to avoid overriding production assets
    # shellcheck disable=SC2207
    clone_settings+=($(echo "$clone_job_data" | runjq -r '.job.settings | keys[] | select (startswith("PUBLISH_")) | . + "=none"')) || return $?
    clone_settings+=("OPENQA_INVESTIGATE_ORIGIN=$host_url/t$origin")
    # clone_call is a plain string holding command and options, so it is split on purpose
    if ! out=$($clone_call "$host_url/tests/$id" "${clone_settings[@]}"); then
        echoerr "unable to clone job $id: $out"
        return 1
    fi
    if [[ $dry_run = 1 ]]; then
        echo "$out"
        # fake a result keyed by the cloned job, matching the lookup below
        out="{ \"$id\": 42}"
    fi

    # output: { "$id": $clone_id }
    clone_id=$(echo "$out" | runjq -r ".\"$id\"") || return $?
    # Create markdown list entry
    echo "* *$name*: t#$clone_id"

    # set priority of cloned jobs
    # shellcheck disable=SC2207
    clone_ids=($(echo "$out" | runjq -r 'values[]')) || return $?
    for cloned_job_id in "${clone_ids[@]}"; do
        client-put-job "$cloned_job_id" "{\"priority\": $((base_prio + prio_add))}" > /dev/null
    done
}

# Trigger the investigation jobs for job $1 and print one markdown list entry
# (or an explanatory message) per investigation step:
#   1. current build + current tests ("retry")
#   2. current build + last good tests
#   3. last good build + current tests
#   4. last good build + last good tests
# Steps 2-4 are only considered if openQA recorded a "last good" job.
#
# Arguments:
#   $1    ID of the job to investigate
#   $2..  optional additional settings handed to every clone
# Globals read: host_url, curl_args
# Returns: non-zero if the investigation data can not be fetched or parsed.
#   Failures of individual clone calls before the last one do not abort the
#   remaining steps.
trigger_jobs() {
    local id job_url investigation last_good_exists last_good test_log
    local vars_last_good='' last_good_tests last_good_build refspec_arg=''
    id="${1:?"Need 'job_id'"}"
    # for 1. current job/build + current test -> check if reproducible/sporadic
    clone "$id" "$id" 'retry' '' "${@:2}"

    job_url="$host_url/tests/$id"
    investigation=$(runcurl "${curl_args[@]}" -sS "$job_url"/investigation_ajax) || return $?
    last_good_exists=$(echo "$investigation" | runjq -r '.last_good') || return $?
    if [[ "$last_good_exists" = "null" || "$last_good_exists" = "not found" ]]; then
        echo "No last good recorded, skipping regression investigation jobs" && return 0
    fi
    last_good=$(echo "$investigation" | runjq -r '.last_good.text') || return $?
    if [[ ! $last_good =~ ^[0-9]+$ ]]; then
        echoerr ".last_good.text not found: investigation for test $id returned '$investigation'"
        return 1
    fi

    # for 2. current job/build + last good test (+ last good needles) ->
    #   check for test (+needles) regression
    test_log=$(echo "$investigation" | runjq -r '.test_log') || return $?
    # note: a here-string is used instead of "echo | grep -q" because grep -q
    # exits on the first match which makes the pipeline fail under "pipefail"
    # if echo has not finished writing yet
    if grep -q "No.*changes recorded" <<< "$test_log"; then
        echo "$test_log. Skipping test regression investigation job."
        last_good_tests=''
    else
        vars_last_good=$(fetch-vars-json "$last_good") || return $?
        last_good_tests=$(echo "$vars_last_good" | runjq -r '.TEST_GIT_HASH') || return $?
        # here we could apply the same approach for needles, not only tests
        # With https://github.com/os-autoinst/os-autoinst/pull/1358 we could
        # theoretically use TEST_GIT_REFSPEC but this would act on the shared
        # test case dir within either the common openQA folder or the common
        # worker cache and hence influence other tests.
        # So better we use CASEDIR with a git refspec, only slightly less
        # efficient and also needing to know which git repo to use
        #refspec_arg="TEST_GIT_REFSPEC=$last_good_tests"
        refspec_arg=$last_good_tests
        clone "$id" "$id" "last_good_tests:$last_good_tests" "$refspec_arg" "${@:2}"
    fi

    # 3. last good job/build + current test -> check for product regression
    if ! grep -q '\<BUILD\>' <<< "$investigation"; then
        echo "Current job has same build as last good, product regression unlikely. Skipping product regression investigation job."
        last_good_build=''
    else
        # reuse vars.json of the last good job if step 2 already fetched it
        vars_last_good=${vars_last_good:-$(runcurl "${curl_args[@]}" -sS "$host_url/tests/$last_good"/file/vars.json)} || return $?
        last_good_build=$(echo "$vars_last_good" | runjq -r '.BUILD') || return $?
        # here we clone with unspecified test refspec, i.e. this could be a
        # more recent tests version. As an alternative we could explicitly
        # checkout the git version from "first bad"
        clone "$id" "$last_good" "last_good_build:$last_good_build" '' "${@:2}"
    fi

    # 4. last good job/build + last good test -> check for other problem
    #    sources, e.g. infrastructure
    if [[ -z $last_good_tests ]]; then
        echo "No test regression expected. Not triggered 'good build+test' as it would be the same as 3., good build + current test"
    elif [[ -z $last_good_build ]]; then
        echo "No product regression expected. Not triggered 'good build+test' as it would be the same as 2., current build + good test"
    else
        clone "$id" "$last_good" "last_good_tests_and_build:$last_good_tests+$last_good_build" "$refspec_arg" "${@:2}"
    fi
}

# Print the dependency data of job $1 or ask the caller to come back later.
#
# Arguments: $1 - job ID, $2 - job data (accepted but not used in here)
# Returns:
#   255 after printing the dependency data, i.e. "go on with this job"
#   142 if jobs of the same cluster are neither done nor cancelled yet
#   the status of runjq if the dependency data can not be evaluated
query_dependency_data_or_postpone() {
    local id=$1
    # shellcheck disable=SC2034
    local job_data=$2
    local dependency_data cluster_jobs pending_cluster_jobs
    # the job itself plus all members of the cluster(s) it belongs to
    local cluster_filter="[$id, [.cluster[] | select(contains([$id]))]] | flatten | unique"
    # number of cluster jobs whose state is neither "done" nor "cancelled"
    # shellcheck disable=SC2016
    local pending_filter='[.nodes[] | select([.id] | inside($cluster_jobs)) | select([.state] | inside(["done", "cancelled"]) | not)] | length'

    # note: This "AJAX" route is normally used to render the dependencies tab in the web UI.
    dependency_data=$(get-dependencies-ajax "$id")
    cluster_jobs=$(echo "$dependency_data" | runjq -r "$cluster_filter") || return $?
    pending_cluster_jobs=$(echo "$dependency_data" | runjq --argjson cluster_jobs "$cluster_jobs" -r "$pending_filter") || return $?

    # postpone if not all dependencies are done/cancelled
    if [[ $pending_cluster_jobs != 0 ]]; then
        echoerr "Postponing to investigate job $id: waiting until $pending_cluster_jobs pending parallel job(s) finished" && return 142
    fi

    # do not skip the job
    echo "$dependency_data"
    return 255
}

# Synchronize concurrent investigations of the same job cluster by posting a
# placeholder comment on the first job of the cluster: only the process that
# wrote the oldest comment mentioning "investigation" goes on.
#
# Arguments: $1 - ID of the job to investigate, $2 - ID of the job carrying
#   the comment
# Globals read: dry_run
# Outputs: the ID of the placeholder comment if this process is in charge
# Returns:
#   255 if the caller should proceed (always the case in dry-run mode, then
#       without printing a comment ID)
#   0   if another process is already investigating and our comment was removed
#   otherwise the status of the failing helper call; a failed removal of our
#   comment is reported as an error instead of proceeding with the
#   investigation that was just announced as skipped
sync_via_investigation_comment() {
    local id=$1 first_cluster_job_id=$2
    local comment_id first_comment_id rc=0

    [[ $dry_run = 1 ]] && return 255
    comment_id=$(client-post-job-comment "$first_cluster_job_id" "Starting investigation for job $id" | runjq -r '.id') || return $?
    first_comment_id=$(client-get-job-comments "$first_cluster_job_id" | runjq -r '[.[] | select(.text | contains("investigation"))] | min_by(.id) | .id') || return $?

    # delete comment again in case a concurrent job could start the investigation before us
    if [[ $comment_id != "$first_comment_id" ]]; then
        echoerr "Skipping investigation of job $id: job cluster is already being investigated, see comment on job $first_cluster_job_id"
        client-delete-job-comment "$first_cluster_job_id" "$comment_id" || rc=$?
        # 255 means "proceed" to the caller, never pass it on from a failed deletion
        [[ $rc == 255 ]] && rc=1
        return "$rc"
    fi

    echo "$comment_id"
    return 255
}

# Turn the placeholder comment into the final investigation comment.
#
# Arguments:
#   $1 - ID of the investigated job
#   $2 - ID of the job carrying the placeholder comment
#   $3 - ID of the placeholder comment
#   $4 - markdown list of the triggered investigation jobs; if empty the
#        placeholder comment is deleted instead
finalize_investigation_comment() {
    local id=$1
    local first_cluster_job_id=$2
    local comment_id=$3
    local comment_text=$4
    local comment

    # nothing was triggered after all, so the placeholder is not needed anymore
    if [[ -z $comment_text ]]; then
        client-delete-job-comment "$first_cluster_job_id" "$comment_id"
        return 0
    fi

    comment="Automatic investigation jobs for job $id:

$comment_text

💡[*Detailed explanation of this comment*](https://github.com/os-autoinst/scripts#More-details-and-examples-about-openqa-investigate-comments)"
    client-put-job-comment "$first_cluster_job_id" "$comment_id" "$comment"

    # the investigated job gets its own copy unless it already carries the comment
    [[ $first_cluster_job_id == "$id" ]] && return 0
    client-post-job-comment "$id" "$comment"
}

# Print the results of the investigation jobs listed in the oldest
# "Automatic investigation jobs for job" comment of job $1, one line per job
# in the format "<investigation type>|<job id>|<result>".
#
# Arguments: $1 - ID of the originally investigated job
# Globals read: dry_run (1: do nothing)
# Returns:
#   0   on success, also if there is no such comment (nothing is printed then)
#   142 if at least one of the listed jobs is not in state "done" yet
#   otherwise the status of the failing helper call
fetch-investigation-results() {
    local origin_job_id=$1
    local state job investigate_type other_id result investigate_comment output line
    local -a comment_lines

    [[ $dry_run = 1 ]] && return 0
    investigate_comment=$(client-get-job-comments "$origin_job_id" | runjq -r '[.[] | select(.text | contains("Automatic investigation jobs for job") and contains(":investigate:retry*:"))] | min_by(.id)') || return $?
    [[ $investigate_comment == 'null' ]] && return
    output=$(echo "$investigate_comment" | runjq -r '.text') || return $?
    mapfile -t comment_lines <<< "$output"

    for line in "${comment_lines[@]}"; do
        # markdown list entries look like "* *<name>:investigate:<type>[:...]*: t#<id>"
        if [[ $line =~ :investigate:([^:*]+).*t\#([0-9]+) ]]; then
            investigate_type=${BASH_REMATCH[1]}
            other_id=${BASH_REMATCH[2]}
            job=$(client-get-job-state "$other_id")
            state=$(echo "$job" | runjq -r '.state') || return $?
            # at least one job is not finished, come back later
            [[ $state != 'done' ]] && return 142

            result=$(echo "$job" | runjq -r '.result') || return $?
            echo "$investigate_type|$other_id|$result"
        fi
    done
}

# Succeed if the job result $1 counts as ok, i.e. is "passed" or "softfailed".
is-ok() {
    case "$1" in
        passed | softfailed) return 0 ;;
        *) return 1 ;;
    esac
}

# Classify the failure of job $1 based on the results of its investigation
# jobs as reported by fetch-investigation-results.
#
# Arguments: $1 - ID of the originally investigated job
# Variables written (deliberately not local, the caller reads them):
#   product_issue - "true" if the job with the last good build is ok while the
#                   other available investigation jobs do not contradict
#   test_issue    - "true" if the job with the last good tests is ok while the
#                   other available investigation jobs do not contradict
#   infra_issue   - "true" if all three investigation jobs failed
#   All three are reset to "false" first; at most one ends up being "true".
# Returns: 0, or the status of fetch-investigation-results if that fails,
#   notably 142 if an investigation job is not finished yet so the hook can be
#   retriggered later.
identify-issue-type() {
    local origin_job_id=$1
    local result investigate_type passed output line
    local -a result_lines
    # '' = no such investigation job, otherwise "true"/"false"
    local pass_lgt='' pass_lgb='' pass_lgtb=''
    product_issue=false test_issue=false infra_issue=false

    output=$(fetch-investigation-results "$origin_job_id") || return $?
    mapfile -t result_lines <<< "$output"

    for line in "${result_lines[@]}"; do
        # expected format: <investigation type>|<job id>|<result>
        [[ $line =~ ^([^|]+)\|([^|]+)\|([^|]+) ]] || continue
        investigate_type=${BASH_REMATCH[1]} result=${BASH_REMATCH[3]}
        if is-ok "$result"; then
            passed=true
        else
            passed=false
        fi
        case $investigate_type in
            last_good_tests_and_build) pass_lgtb=$passed ;;
            last_good_build) pass_lgb=$passed ;;
            last_good_tests) pass_lgt=$passed ;;
        esac
    done

    # "!= false" reads "missing or ok", "!= true" reads "missing or not ok"
    if [[ $pass_lgtb != false && $pass_lgt != true && $pass_lgb == true ]]; then
        product_issue=true
    elif [[ $pass_lgtb != false && $pass_lgt == true && $pass_lgb != true ]]; then
        test_issue=true
    elif [[ $pass_lgtb == false && $pass_lgt == false && $pass_lgb == false ]]; then
        infra_issue=true
    fi
}

# Handle a finished ":investigate:" job: for the ":investigate:retry" job of
# an investigation post a summary comment on the originally investigated job.
#
# Arguments: $1 - ID of the finished investigation job, $2 - its test name
# Globals read: job_data (data of job $1), host_url, not_found_regex
# Returns:
#   0   if there is nothing to do, the comment was posted or the original job
#       is gone (including a 404 when posting the comment)
#   2   if posting the comment failed for another reason
#   otherwise the status of the failing helper call, notably 142 passed on
#   from identify-issue-type while other investigation jobs are unfinished
post-investigate() {
    local id=$1 retry_name=$2
    local rc=0 status
    local retry_result investigate_origin origin_job_id origin_job_data origin_name comment out
    if [[ ! "$retry_name" =~ investigate:retry$ ]]; then
        echo "Job is ':investigate:' already, skipping investigation"
        return 0
    fi
    # We are in the investigate:retry job now. From here we will check the
    # results of the other investigation jobs, if necessary
    retry_result="$(echo "$job_data" | runjq -r '.job.result')" || return $?
    investigate_origin="$(echo "$job_data" | runjq -r '.job.settings.OPENQA_INVESTIGATE_ORIGIN')" || return 1
    # the setting has the form "$host_url/t<job id>"
    origin_job_id=${investigate_origin#"$host_url/t"}
    origin_job_data=$(client-get-job "$origin_job_id") || rc=$?
    # shellcheck disable=SC2154
    if [[ $origin_job_data =~ $not_found_regex ]]; then
        echo "Skipping posting investigation comment on original job $origin_job_id as it does not exist anymore"
        return 0
    fi
    [[ $rc != 0 ]] && return $rc
    origin_name="$(echo "$origin_job_data" | runjq -r '.job.test')" || return $?
    # cluster jobs might have the same OPENQA_INVESTIGATE_ORIGIN as the root retry job
    if [[ $retry_name != "$origin_name:investigate:retry" ]]; then
        echo "Job $retry_name ($id) is not a retry of $origin_name ($origin_job_id)"
        return 0
    fi

    comment="Investigate retry job *$retry_name*: t#$id"
    if is-ok "$retry_result"; then
        comment+=" $retry_result."$'\n\n'
        comment+="📋 **Likely a sporadic failure**."
    else
        comment+=" failed."$'\n\n'
        # set by identify-issue-type, which assigns them without declaring them
        local product_issue=false
        local test_issue=false
        local infra_issue=false
        identify-issue-type "$origin_job_id" || return $?

        if "$product_issue"; then
            comment+="📋 **Jobs including the last good build are ok, likely a product issue**."
        elif "$test_issue"; then
            comment+="📋 **Jobs including the last good test are ok, likely a test issue**."
        elif "$infra_issue"; then
            comment+="📋 **All investigation jobs failed, likely an issue with the test environment (settings, external resources, infrastructure)**."
        else
            comment+="📋 **Likely not a sporadic failure**."
        fi
    fi

    # meanwhile the original job might have been deleted already, handle
    # gracefully
    out=$(client-post-job-comment "$origin_job_id" "$comment" 2>&1) || rc=$?
    if [[ "$rc" -ne 0 ]]; then
        status=$(echo "$out" | runjq .error_status 2> /dev/null || echo "unknown")
        if [[ "$status" != "404" ]]; then
            echoerr "Unexpected error encountered when posting comments on job $origin_job_id after investigation job $id failed: '$out'"
            return 2
        fi
    fi
}

# crosscheck
# 1. current job/build + current test -> check if reproducible/sporadic
# 2. current job/build + last good test (+ last good needles) -> check for
#    test (+needles) regression
# 3. last good job/build + current test -> check for product regression
# 4. last good job/build + last good test -> check for other problem
#    sources, e.g. infrastructure
# Investigate a single job: either trigger the investigation jobs for it or,
# if the job itself is an ":investigate:" job, hand over to post-investigate.
#
# Arguments:
#   $1    job ID; everything up to the last "/" is stripped, so a value with
#         leading path/URL components ending in the ID works as well
#   $2..  optional additional settings handed on to trigger_jobs
# Variables written: job_data (read by clone and post-investigate) as well as
#   the other non-local working variables assigned below
# Returns: 0 if the job was handled or skipped, 1 if the job data could not
#   be queried, otherwise the status of the failing helper, e.g. 142 from
#   query_dependency_data_or_postpone while cluster jobs are still pending.
investigate() {
    local id="${1##*/}"
    local rc=0

    job_data=$(client-get-job "$id")
    # shellcheck disable=SC2181
    [[ $? != 0 ]] && echoerr "unable to query job data for $id: $job_data" && return 1
    old_name="$(echo "$job_data" | runjq -r '.job.test')" || return $?
    # jobs created by a previous investigation are only evaluated, never investigated again
    if [[ "$old_name" =~ ":investigate:" ]]; then
        post-investigate "$id" "$old_name" || return $?
        return 0
    fi
    # note: this variable shares its name with the clone function, which does not conflict
    clone="$(echo "$job_data" | runjq -r '.job.clone_id')" || return $?
    # "$force" is executed as a command, so it is expected to be "true" or "false"
    if ! "$force" && [[ "$clone" != "null" ]]; then
        echoerr "Job $id already has a clone, skipping investigation. Use the env variable 'force=true' to trigger investigation jobs"
        return 0
    fi

    # determine dependency data or postpone if cluster not done
    # (the helper returns 255 for "go on"; any other status ends the investigation here)
    dependency_data=$(query_dependency_data_or_postpone "$id" "$job_data") || rc=$?
    [[ $rc != 255 ]] && return $rc

    # determine the job in the cluster with the lowest ID to use that for commenting/synchronization
    first_cluster_job_id=$(echo "$dependency_data" | runjq -r "[$id, [.cluster[] | select(contains([$id]))]] | flatten | min") || return $?

    [[ "$old_name" =~ $exclude_name_regex ]] && echo "Job name '$old_name' matches \$exclude_name_regex '$exclude_name_regex', skipping investigation" && return 0
    # parent_group and group are joined to "<parent job group name> / <job group name>"
    group="$(echo "$job_data" | runjq -r '.job.parent_group + " / " + .job.group')" || return $?
    [[ "$group" = " / " ]] && [[ "$exclude_no_group" = "true" ]] && echo "Job w/o job group, \$exclude_no_group is set, skipping investigation" && return 0
    [[ "$group" =~ $exclude_group_regex ]] && echo "Job group '$group' matches \$exclude_group_regex '$exclude_group_regex', skipping investigation" && return 0

    # Optionally we can find "first failed", could extend openQA investigation
    # method instead for we are just working based on supplied job which can
    # have more, ambiguous potential changes that we need to bisect on

    # sync by writing initial investigation comment (edited later)
    # (255 again means "go on"; in dry-run mode comment_id stays empty)
    rc=0
    comment_id=$(sync_via_investigation_comment "$id" "$first_cluster_job_id") || rc=$?
    [[ $rc != 255 ]] && return $rc

    # the captured output of trigger_jobs becomes the text of the final
    # comment; if it is empty the initial comment gets deleted again
    out=$(trigger_jobs "$id" "${@:2}")
    $verbose && echo "$0, id: '$id', out: '$out'"
    finalize_investigation_comment "$id" "$first_cluster_job_id" "$comment_id" "$out"
}

# Entry point: set up the client and clone commands, then investigate the job.
#
# Arguments: $1 - job ID (mandatory), further arguments are passed on to
#   investigate unchanged
# Globals read: dry_run, client_args; client_call and clone_call if preset
# Globals written: client_prefix, client_call, clone_call
main() {
    local id=${1:?"Need 'job_id'"}
    # in dry-run mode commands are only printed instead of executed
    if [ "$dry_run" = "1" ]; then
        client_prefix="echo"
    else
        client_prefix=''
    fi
    # client_call might not be set at all, so relax "nounset" for the check
    set +u
    if [[ -z "${client_call[*]}" ]]; then
        client_prefix="${client_prefix:-runcli}"
        client_call=("$client_prefix" openqa-cli "${client_args[@]}")
    fi
    set -u
    if [[ -z "${clone_call:-}" ]]; then
        clone_call="$client_prefix openqa-clone-job --json-output --skip-chained-deps --max-depth 0 --parental-inheritance --within-instance"
    fi
    investigate "$@"
}

# Only run main if the script is executed directly: "caller 0" succeeds only
# within a function call or a sourced file, so sourcing this file (e.g. from
# tests) merely defines the functions above.
caller 0 > /dev/null || main "$@"