From 842515e03d8f4ae88336c450c49d9a3a8659ec61 Mon Sep 17 00:00:00 2001 From: Sonja Heinze Date: Tue, 12 Sep 2023 18:59:53 +0200 Subject: [PATCH] Implement a Fuzzy CI to catch regressions This commit implements a Fuzzy CI to catch end-to-end regressions in Merlin by checking the responses of a set of ocamlmerlin queries. That set of queries is determined by a deterministic but randomly chosen set of file location samples on the Irmin code base. The CI passes automatically if the responses on all samples remain the same before and after the PR. If there's a diff in the responses, the CI fails and makes the diff available as a GH action artifact. There's a high-level description of the CI in much more detail on https://github.com/ocaml/merlin/wiki/Merlin-Fuzzy-CI. Co-authored-by: Enguerrand Decorne --- .github/fuzzy-ci-helpers/create_diff.ml | 45 +++ .github/workflows/fuzzy-ci.yml | 509 ++++++++++++++++++++++++ CHANGES.md | 2 +- 3 files changed, 555 insertions(+), 1 deletion(-) create mode 100644 .github/fuzzy-ci-helpers/create_diff.ml create mode 100644 .github/workflows/fuzzy-ci.yml diff --git a/.github/fuzzy-ci-helpers/create_diff.ml b/.github/fuzzy-ci-helpers/create_diff.ml new file mode 100644 index 0000000000..0ab15910d7 --- /dev/null +++ b/.github/fuzzy-ci-helpers/create_diff.ml @@ -0,0 +1,45 @@ +type read_result = Success | Eof + +let read_line_opt () = try Some (read_line ()) with End_of_file -> None +let delimiter1 = Sys.argv.(1) +let delimiter2 = Sys.argv.(2) +let diff_file = Sys.argv.(3) + +let rec loop () = + let rec stdin_to_file ~until file_oc = + match read_line_opt () with + | None -> Eof + | Some line -> + if String.equal line until then Success + else ( + output_string file_oc (line ^ "\n"); + stdin_to_file ~until file_oc) + in + let label1 = read_line_opt () in + let tmp1 = Filename.temp_file "tmp1" "" in + let oc = open_out tmp1 in + let read_result1 = stdin_to_file ~until:delimiter1 oc in + close_out oc; + let label2 = read_line_opt () in + let tmp2 = Filename.temp_file "tmp2" "" in + let oc = open_out tmp2 in + let read_result2 = stdin_to_file ~until:delimiter2 oc in + close_out oc; + match (label1, read_result1, label2, read_result2) with + | Some label1, Success, Some label2, Success -> + let diff_cmd = + Printf.sprintf + "diff -U 5 --label=\"%s\" --label=\"%s\" \"%s\" \"%s\" >> %s" label1 + label2 tmp1 tmp2 diff_file + in + let _ = Sys.command diff_cmd in + loop () + | Some _, Success, Some _, Eof -> + raise (Failure "EOF before reaching delimiter2 when reading second JSON.") + | Some _, Success, None, _ -> + raise (Failure "EOF before reaching delimiter2 when second label.") + | Some _, Eof, _, _ -> + raise (Failure "EOF before reaching delimiter1 when reading first JSON.") + | None, _, _, _ -> () + +let () = loop () diff --git a/.github/workflows/fuzzy-ci.yml b/.github/workflows/fuzzy-ci.yml new file mode 100644 index 0000000000..d1f10cac00 --- /dev/null +++ b/.github/workflows/fuzzy-ci.yml @@ -0,0 +1,509 @@ +name: Fuzzy CI + +on: + pull_request: + branches: [ master ] + types: [ opened, synchronize, reopened, unlabeled, labeled ] + paths-ignore: + - '**.md' + - '**.txt' + - '.git*' + - 'doc/**' + - 'emacs/**' + - 'vim/**' + - '**/emacs-lint.yml' + - 'bench/**' + - 'upstream/**' + - 'tests/**' + +env: + # Artifact names need to be consistent across jobs: + BASE_BRANCH_ARTIFACT_NAME: base-branch-data-${{ github.event.pull_request.base.sha }}-pr${{ github.event.pull_request.number }} + MERGE_BRANCH_ARTIFACT_NAME: merge-branch-data-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }}-pr${{ github.event.pull_request.number }} + DIFF_ARTIFACT_NAME: diff-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }} + + # File names also need to be consistant across jobs: + FULL_DIFF_FILE: full_responses.diff + DISTILLED_DIFF_FILE: distilled_data.diff + # Note: FULL_DATA_FILE and DISTILLED_DATA_FILE need to be the file names of the files generated by `merl-an behavior` + FULL_DATA_FILE: full_responses.json + DISTILLED_DATA_FILE: distilled_data.json + + # GitHub API related short-hands: + GH_API_COMMENTS: ${{ github.event.pull_request.comments_url }} + GH_API_LABELS: ${{ github.event.pull_request.issue_url }}/labels + GH_API_ARTIFACTS: ${{ github.event.pull_request.base.repo.url }}/actions/artifacts + + # URL short-hands + ACTIONS_RUNS_ENDPOINT: ${{ github.event.repository.html_url }}/actions/runs + CURRENT_ACTION_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} + + # Irmin version and merl-an version need to be consistent for reproducibility (Irmin is used as the test code base to test `ocamlmerlin` on) + IRMIN_VERSION: 3.9.0 + # TODO: Release merl-an and install a certain version instead of pinning it to a certain commit + MERL_AN_SHA: 1643fb7a9958379fb4ed8d7c5169146aaa88f5b7 + + # The compiler version used on the respective branches. It also needs to form part of Irmin's build cache key. + # Bump either of these whenever the compiler version is bumped on either of the two branches. + merge_branch_COMPILER_VERSION: ocaml-base-compiler.4.14.1 + base_branch_COMPILER_VERSION: ocaml-base-compiler.4.14.1 + +jobs: + data: + name: Generate data + runs-on: ubuntu-22.04 + if: > + github.event.action == 'opened' || + github.event.action == 'synchronize' || + github.event.action == 'reopened' || + ( + github.event.action == 'unlabeled' && + github.event.label.name == 'fuzzy-diff-looks-good' + ) + env: + data_dir: data + strategy: + matrix: + commit: ["merge_branch", "base_branch"] + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Checking out ${{ matrix.commit }} + env: + base_branch_sha: ${{ github.event.pull_request.base.sha }} + merge_branch_sha: ${{ github.sha }} + run: | + sha=$${{ matrix.commit }}_sha + echo "Check out $sha" + git checkout $sha + + - name: Get desired compiler version + id: compiler + run: | + v=$${{ matrix.commit }}_COMPILER_VERSION + echo "version=$v" | tee -a $GITHUB_OUTPUT + + - name: Install OCaml + uses: ocaml/setup-ocaml@v2 + with: + ocaml-compiler: ${{ steps.compiler.outputs.version }} + dune-cache: true + + - name: Install merlin dependencies + run: | + opam pin menhirLib 20201216 --no-action + opam install . --deps-only --yes + + - name: Install merlin + run: | + # Running `subst` to have the current commit in the data produced by `merl-an` + opam exec -- dune subst + opam exec -- dune build -p merlin-lib,dot-merlin-reader,merlin + opam exec -- dune install -p merlin-lib,dot-merlin-reader,merlin + + - name: Pull irmin and its deps from cache if possible + uses: actions/cache@v3 + id: irmin-cache + with: + path: irmin/ + key: os${{ runner.os }}+arch${{ runner.arch }}+${{ hashFiles('fuzzy-ci-helpers/irmin.3.9.0.opam.locked') }}+${{ env.IRMIN_VERSION }}+${{ steps.compiler.outputs.version }} + + - name: Download Irmin tarball + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: | + wget https://github.com/mirage/irmin/releases/download/$IRMIN_VERSION/irmin-$IRMIN_VERSION.tbz + + - name: Create irmin dir + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: mkdir -p irmin + + - name: Decompress Irmin tarball + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: tar xvf irmin-$IRMIN_VERSION.tbz -C irmin --strip-components=1 + + - name: Get Irmin's lock files + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: | + cp .github/fuzzy-ci-helpers/irmin.3.9.0.opam.locked irmin/irmin.opam.locked + + - name: Install opam monorepo + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: opam install opam-monorepo --yes + + - name: Pull in Irmin's dependencies + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: | + git checkout ${{ github.sha }} + opam monorepo pull --lockfile=irmin.opam.locked --yes + working-directory: irmin + + - name: Prune Irmin + if: steps.irmin-cache.outputs.cache-hit != 'true' + run: | + rm -r examples/ bench/ + find test/ -mindepth 1 -maxdepth 1 -type d -not -name 'irmin-pack' -exec rm -r {} \; + find src/ -mindepth 1 -maxdepth 1 -type d \ + -not -name 'irmin-pack' \ + -not -name 'irmin' \ + -not -name 'irmin-tezos' \ + -not -name ppx_irmin \ + -not -name irmin_test \ + -not -name irmin-test \ + -exec rm -r {} \; + working-directory: irmin + + - name: Build Irmin + run: | + opam exec -- dune build @check + working-directory: irmin + + - name: Pull merl-an from cache if possible + uses: actions/cache@v3 + id: merl-an-cache + with: + path: /usr/local/bin/merl-an + key: os${{ runner.os }}+arch${{ runner.arch }}+merl-an-sha$MERL_AN_SHA + + - name: Install merl-an + if: steps.merl-an-cache.outputs.cache-hit != 'true' + run: opam pin -y merl-an https://github.com/pitag-ha/merl-an.git#$MERL_AN_SHA + + - name: Add merl-an to /usr/local/bin/ + if: steps.merl-an-cache.outputs.cache-hit != 'true' + run: opam exec -- cp $GITHUB_WORKSPACE/_opam/bin/merl-an /usr/local/bin/merl-an + + - name: Create data set of Merlin responses + run: | + # Note: The parameters with most influence on the execution time are + # `--sample-size`: Number of samples per file defined by `--project` (and per local query). + # `--project`: List of dirs/files to create samples on. In the case of a dirs, all ml(i) files recursively in the dir are used. + # `--queries`: The `ocamlmerlin` queries that are being run. + opam exec -- merl-an behavior \ + --queries=type-enclosing,occurrences,locate,complete-prefix,errors \ + --sample-size=30 \ + --data=${{ env.data_dir }} \ + --merlin=ocamlmerlin \ + --project=irmin/src/irmin,irmin/src/irmin-pack,irmin/test/irmin-pack + + - name: Remove varying components from data + run: | + # TODO: This could be done on the `merl-an` side + jq '.responses |= map(del(.heap_mbytes, .timings, .cache))' \ + ${{ env.data_dir }}/$FULL_DATA_FILE > temp.json && \ + mv temp.json ${{ env.data_dir }}/$FULL_DATA_FILE + + - name: Create name for data artifact + id: artifact_name + env: + base_branch_artifact_name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }} + merge_branch_artifact_name: ${{ env. MERGE_BRANCH_ARTIFACT_NAME }} + run: echo "name=$${{ matrix.commit }}_artifact_name" >> $GITHUB_OUTPUT + + - name: Upload data + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.artifact_name.outputs.name }} + path: ${{ env.data_dir }} + + - name: Compile diff tool + if: ${{ matrix.commit == 'merge_branch' }} + run: | + # Taking advantage that ocamlopt is installed on this runner: compile the diff tool here and share it with the next job where it's needed. + # All GH runners are hosted on x86 machines and all jobs in this workflow declare the same OS, so this should workTM. + opam exec -- ocamlopt -o create_diff .github/fuzzy-ci-helpers/create_diff.ml + + - name: Upload diff tool + if: ${{ matrix.commit == 'merge_branch' }} + uses: actions/upload-artifact@v3 + with: + name: diff_tool + path: create_diff + + + + diff: + name: Generate diffs + runs-on: ubuntu-22.04 + outputs: + diff_exits: ${{steps.full_responses_diff.outputs.diff_exists}} + needs: data + env: + base_data_dir: base_data + merge_data_dir: merge_data + diff_dir: diff + steps: + - name: Download base branch data + uses: actions/download-artifact@v3 + with: + name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }} + path: ${{ env.base_data_dir }} + + - name: Download merge branch data + uses: actions/download-artifact@v3 + with: + name: ${{ env.MERGE_BRANCH_ARTIFACT_NAME }} + path: ${{ env.merge_data_dir }} + + - name: Create diff dir + run: mkdir -p "$diff_dir" + + - name: Download diff tool + uses: actions/download-artifact@v3 + with: + name: diff_tool + + - name: Give diff tool execute permissions + run: chmod +x create_diff + + - name: Generate full responses diff + id: full_responses_diff + run: | + jq -r -n \ + --slurpfile data1 "$base_data_dir/$FULL_DATA_FILE" \ + --slurpfile data2 "$merge_data_dir/$FULL_DATA_FILE" \ + 'def process_json($branch; $data): + ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data; + range($data1|length) as $i | + process_json("base branch"; $data1[$i]), + "--input-separator--", + process_json("merge branch"; $data2[$i]), + "--diff-cmd-separator--"' \ + | ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$FULL_DIFF_FILE" + if [ -s "$diff_dir/$FULL_DIFF_FILE" ]; then + echo "diff_exists=true" | tee -a $GITHUB_OUTPUT + else + echo "diff_exists=false" | tee -a $GITHUB_OUTPUT + fi + + - name: Generate distilled data diff + # If there's no full reponses diff, there also won't be a distilled data diff + if: ${{ steps.full_responses_diff.outputs.diff_exists == 'true' }} + run: | + jq -r -n \ + --slurpfile data1 "$base_data_dir/$DISTILLED_DATA_FILE" \ + --slurpfile data2 "$merge_data_dir/$DISTILLED_DATA_FILE" \ + 'def process_json($branch; $data): + ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data; + range($data1|length) as $i | + process_json("base branch"; $data1[$i]), + "--input-separator--", + process_json("merge branch"; $data2[$i]), + "--diff-cmd-separator--"' \ + | ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$DISTILLED_DIFF_FILE" + + - name: Upload diff(s) + uses: actions/upload-artifact@v3 + with: + name: ${{ env.DIFF_ARTIFACT_NAME }} + path: ${{ env.diff_dir }} + + + + output: + name: Evaluate diffs + runs-on: ubuntu-22.04 + needs: diff + env: + earlier_diff_was_approved: ${{ contains(github.event.pull_request.labels.*.name, 'fuzzy-diff-looks-good') }} + current_diff_exists: ${{ needs.diff.outputs.diff_exits }} + diff_dir: ${{ needs.artifact_names.outputs.diff_dir }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Download current diff(s) + if: ${{ env.current_diff_exists == 'true' }} + uses: actions/download-artifact@v3 + with: + name: ${{ env.DIFF_ARTIFACT_NAME }} + + - name: Retreive hash of approved diff + if: ${{ env.earlier_diff_was_approved == 'true' }} + id: approved_diff + run: | + msg_start=$(head -c 50 .github/fuzzy-ci-helpers/msg.txt) + + next_page_endpoint="$GH_API_COMMENTS?per_page=100&page=1" + latest_comment="{}" + + while [ -n "$next_page_endpoint" ]; do + latest_comment=$( + curl -s -D "headers.txt" -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" "$next_page_endpoint" | + jq --arg msg_start "$msg_start" --argjson latest "{}" ' + map( + select( + (.body | startswith($msg_start)) and .user.login == "github-actions[bot]" + ) + ) + [$latest] | max_by(.created_at)' + ) + + next_page_endpoint=$( + rg '^link:' headers.txt | + tr ',' '\n' | + rg 'rel="next"' | + cut -d'<' -f2 | + cut -d'>' -f1 + ) + done + + hash=$(echo "$latest_comment" | jq '.body' -r | grep '256-sha' | awk '{print $NF}') + echo "hash='$hash'" | tee -a $GITHUB_OUTPUT + + - name: Analyze current diff + id: current_diff + run: | + hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}') + echo "hash='$hash'" | tee -a $GITHUB_OUTPUT + + - name: Write instruction to delete PR label + # When this workflow is triggered by a PR from a fork, it doesn't have + # the permissions to delete PR labels. Instead, we forward the + # instruction to delete the label to fuzzy-ci-privileged.yml. + if: ${{ env.earlier_diff_was_approved == 'true' && steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }} + run: | + echo ${{ steps.approved_diff.outputs.hash }} + echo ${{ steps.current_diff.outputs.hash }} + mkdir -p ./forward + jq -n \ + --arg instruction "delete_label" \ + --arg endpoint "$GH_API_LABELS" \ + '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json + + - name: Upload instruction to delete label + if: ${{ env.earlier_diff_was_approved == 'true' && steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }} + uses: actions/upload-artifact@v3 + with: + name: forwarded_instructions + path: forward/ + + - name: Return + id: return + env: + github_api_labels_url: ${{ github.event.pull_request.base.repo.url }}/issues/${{ github.event.pull_request.number }}/labels + run: | + print_head_of_diffs () { + echo "--------beginning of full responses diff head--------" + head -n 100 "$FULL_DIFF_FILE" + echo "--------end of full responses diff head--------" + echo "--------beginning of distilled data diff head--------" + head -n 100 "$DISTILLED_DIFF_FILE" + echo "--------end of distilled data diff head--------" + } + + # FIXME (?): Are nested conditionals always so ugly in Bash, or is there a better way? Option types and the possibility to match would help a lot. + LABEL_NAME=$(cat .github/fuzzy-ci-helpers/label_name.txt) + if $earlier_diff_was_approved; then + echo "Earlier diff was approved." + if [ ${{ steps.current_diff.outputs.hash }} == ${{ steps.approved_diff.outputs.hash }} ]; then + echo "This diff has been approved earlier. Everything ok." + exit 0 + else + print_head_of_diffs + printf "The diff has changed since it was approved. So I'm removing the $LABEL_NAME label. If the new diff looks good, please set the label again.\n\ + There's a head of the new diffs printed above. The whole diffs can be downloaded from $CURRENT_ACTION_URL .\n\ + Previous sha256: ${{ steps.approved_diff.outputs.hash }}\n\ + Current sha256: ${{ steps.current_diff.outputs.hash }}" + echo "delete_label=true" >> $GITHUB_OUTPUT + exit 1 + fi + else + if $current_diff_exists; then + print_head_of_diffs + printf "There's a head of the diffs printed above. The diffs can be downloaded from $CURRENT_ACTION_URL .\nIf it looks good, please set the $LABEL_NAME label on the PR." + exit 1 + else + echo "No diff. All good." + exit 0 + fi + fi + + + approve: + name: Approve diff + if: > + github.event_name == 'pull_request' && + github.event.action == 'labeled' && + github.event.label.name == 'fuzzy-diff-looks-good' + runs-on: ubuntu-22.04 + steps: + - name: Retreive diff artifact meta-data + id: diff_metadata + run: | + all_artifacts=$(curl -sSL "$GH_API_ARTIFACTS") + diff_artifact=$(echo "$all_artifacts" | jq "first(.artifacts[] | select(.name == \"$DIFF_ARTIFACT_NAME\") )") + id=$(echo "$diff_artifact" | jq ".id") + echo "id=$id" | tee -a $GITHUB_OUTPUT + workflow_run=$(echo "$diff_artifact" | jq ".workflow_run | .id") + echo "workflow_run=$workflow_run" | tee -a $GITHUB_OUTPUT + if [ -z $id ]; then + echo "exists=false" | tee -a $GITHUB_OUTPUT + else + echo "exists=true" | tee -a $GITHUB_OUTPUT + fi + + - name: Write instruction to delete PR label + # When this workflow is triggered by a PR from a fork, it doesn't have + # the permissions to delete PR labels. Instead, we forward the + # instruction to delete the label to fuzzy-ci-privileged.yml. + if: ${{ steps.diff_metadata.outputs.exists == 'false' }} + run: | + mkdir -p ./forward + jq -n \ + --arg instruction "delete_label" \ + --arg endpoint "$GH_API_LABELS" \ + '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json + + - name: Upload instruction to delete label + if: ${{ steps.diff_metadata.outputs.exists == 'false' }} + uses: actions/upload-artifact@v3 + with: + name: forwarded_instructions + path: forward/ + + - name: Fail due to diff not existing yet + if: ${{ steps.diff_metadata.outputs.exists == 'false' }} + run: | + printf "You seem to have tried to approve a diff that doesn't exist yet.\nWait for the diff to have been generated and then try again." + exit 1 + + - name: Download diff + env: + id: ${{ steps.diff_metadata.outputs.id }} + run: | + # Doing this manually, since actions/download-artifact only works on the same workflow run on which the artifact was uploaded + curl -sSLO -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" "$GH_API_ARTIFACTS/$id/zip" -D headers.txt + + - name: Unzip downloaded diff + run: | + unzip zip || (echo "Download of diff artifact failed" && cat headers.txt && cat zip && exit 1) + + - name: Compute full responses diff hash + id: diff_hash + run: | + hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}') + echo "hash=$hash" | tee -a $GITHUB_OUTPUT + + - name: Write instruction to comment on PR + # When this workflow is triggered by a PR from a fork, it doesn't have + # the permissions to comment on PRs. Instead, we forward the + # instruction to comment on the PR to fuzzy-ci-privileged.yml. + env: + approved_diffs_workflow_run: ${{ steps.diff_metadata.outputs.workflow_run }} + approved_diffs_hash: ${{ steps.diff_hash.outputs.hash }} + run: | + mkdir -p ./forward + jq -n \ + --arg instruction "comment" \ + --arg endpoint "$GH_API_COMMENTS" \ + --arg artifacts_url "$ACTIONS_RUNS_ENDPOINT/$approved_diffs_workflow_run" \ + --arg hash "$approved_diffs_hash" \ + '{instruction: $instruction, endpoint: $endpoint, artifacts_url: $artifacts_url, hash: $hash}' > ./forward/instruction.json + + - name: Upload instruction to comment on PR + uses: actions/upload-artifact@v3 + with: + name: forwarded_instructions + path: forward/ diff --git a/CHANGES.md b/CHANGES.md index d7e1121fe5..67bccef0fc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ merlin NEXT_VERSION + merlin binary - Add a "heap_mbytes" field to Merlin server responses to report heap usage (#1717) - Add cache stats to telemetry (#1711) - + - Add a query_num field to the `ocamlmerlin` responses to detect server crashes (#1716) merlin 4.13 ===========