From 4c3a80b2c06cb5a39eeabe422f718a1b6cfc7be2 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Mon, 1 Jan 2024 17:10:05 +0800 Subject: [PATCH 1/2] tests: Update TestRestartFromPowerFailure Update the case with a combination of the EXT4 filesystem's commit setting and an unexpected exit event. The EXT4 filesystem's commit setting makes it sync all of its data and metadata every N seconds. The kernel can help us sync even if the process has been killed. With different commit settings, we can simulate the case where the kernel has synced only part of the dirty pages before the power failure. And for the unexpected exit event, we can kill the process randomly or panic at a failpoint instead of following a fixed code path. Signed-off-by: Wei Fu --- tests/dmflakey/dmflakey.go | 5 ++ tests/robustness/powerfailure_test.go | 85 +++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/tests/dmflakey/dmflakey.go b/tests/dmflakey/dmflakey.go index d9bdf99a0..25061a4cb 100644 --- a/tests/dmflakey/dmflakey.go +++ b/tests/dmflakey/dmflakey.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "os/exec" + "path" "path/filepath" "strings" "time" @@ -289,6 +290,10 @@ func createEmptyFSImage(imgPath string, fsType FSType) error { return fmt.Errorf("failed to create image because %s already exists", imgPath) } + if err := os.MkdirAll(path.Dir(imgPath), 0600); err != nil { + return fmt.Errorf("failed to ensure parent directory %s: %w", path.Dir(imgPath), err) + } + f, err := os.Create(imgPath) if err != nil { return fmt.Errorf("failed to create image %s: %w", imgPath, err) diff --git a/tests/robustness/powerfailure_test.go b/tests/robustness/powerfailure_test.go index a1d0bc598..09ae88124 100644 --- a/tests/robustness/powerfailure_test.go +++ b/tests/robustness/powerfailure_test.go @@ -4,8 +4,11 @@ package robustness import ( "bytes" + "crypto/rand" "fmt" "io" + "math" + "math/big" "net/http" "net/url" "os" @@ -23,9 +26,65 @@ import ( "golang.org/x/sys/unix" ) +var panicFailpoints = []string{ + "beforeSyncDataPages", + 
"beforeSyncMetaPage", + "lackOfDiskSpace", + "mapError", + "resizeFileError", + "unmapError", +} + // TestRestartFromPowerFailure is to test data after unexpected power failure. func TestRestartFromPowerFailure(t *testing.T) { - flakey := initFlakeyDevice(t, t.Name(), dmflakey.FSTypeEXT4, "") + for _, tc := range []struct { + name string + du time.Duration + fsMountOpt string + useFailpoint bool + }{ + { + name: "fp_ext4_commit5s", + du: 5 * time.Second, + fsMountOpt: "commit=5", + useFailpoint: true, + }, + { + name: "fp_ext4_commit1s", + du: 10 * time.Second, + fsMountOpt: "commit=1", + useFailpoint: true, + }, + { + name: "fp_ext4_commit1000s", + du: 10 * time.Second, + fsMountOpt: "commit=1000", + useFailpoint: true, + }, + { + name: "kill_ext4_commit5s", + du: 5 * time.Second, + fsMountOpt: "commit=5", + }, + { + name: "kill_ext4_commit1s", + du: 10 * time.Second, + fsMountOpt: "commit=1", + }, + { + name: "kill_ext4_commit1000s", + du: 10 * time.Second, + fsMountOpt: "commit=1000", + }, + } { + t.Run(tc.name, func(t *testing.T) { + doPowerFailure(t, tc.du, tc.fsMountOpt, tc.useFailpoint) + }) + } +} + +func doPowerFailure(t *testing.T, du time.Duration, fsMountOpt string, useFailpoint bool) { + flakey := initFlakeyDevice(t, strings.Replace(t.Name(), "/", "_", -1), dmflakey.FSTypeEXT4, fsMountOpt) root := flakey.RootFS() dbPath := filepath.Join(root, "boltdb") @@ -38,6 +97,8 @@ func TestRestartFromPowerFailure(t *testing.T) { } logPath := filepath.Join(t.TempDir(), fmt.Sprintf("%s.log", t.Name())) + require.NoError(t, os.MkdirAll(path.Dir(logPath), 0600)) + logFd, err := os.Create(logPath) require.NoError(t, err) defer logFd.Close() @@ -64,10 +125,18 @@ func TestRestartFromPowerFailure(t *testing.T) { } }() - time.Sleep(time.Duration(time.Now().UnixNano()%5+1) * time.Second) + time.Sleep(du) t.Logf("simulate power failure") - activeFailpoint(t, fpURL, "beforeSyncMetaPage", "panic") + if useFailpoint { + fpURL = "http://" + fpURL + targetFp := 
panicFailpoints[randomInt(t, math.MaxInt32)%len(panicFailpoints)] + t.Logf("random pick failpoint: %s", targetFp) + activeFailpoint(t, fpURL, targetFp, "panic") + } else { + t.Log("kill bbolt") + assert.NoError(t, cmd.Process.Kill()) + } select { case <-time.After(10 * time.Second): @@ -89,10 +158,10 @@ func TestRestartFromPowerFailure(t *testing.T) { // activeFailpoint actives the failpoint by http. func activeFailpoint(t *testing.T, targetUrl string, fpName, fpVal string) { - u, err := url.Parse("http://" + path.Join(targetUrl, fpName)) + u, err := url.JoinPath(targetUrl, fpName) require.NoError(t, err, "parse url %s", targetUrl) - req, err := http.NewRequest("PUT", u.String(), bytes.NewBuffer([]byte(fpVal))) + req, err := http.NewRequest("PUT", u, bytes.NewBuffer([]byte(fpVal))) require.NoError(t, err) resp, err := http.DefaultClient.Do(req) @@ -192,3 +261,9 @@ func unmountAll(target string) error { } return fmt.Errorf("failed to umount %s: %w", target, unix.EBUSY) } + +func randomInt(t *testing.T, max int) int { + n, err := rand.Int(rand.Reader, big.NewInt(int64(max))) + assert.NoError(t, err) + return int(n.Int64()) +} From c61a3be3e85720c7a8f6fcd6c71c00aea45218c5 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Mon, 1 Jan 2024 22:49:06 +0800 Subject: [PATCH 2/2] *: introduce nightly run for robustness Signed-off-by: Wei Fu --- .github/workflows/robustness_nightly.yaml | 17 ++++++++++ .github/workflows/robustness_template.yaml | 38 ++++++++++++++++++++++ .github/workflows/robustness_test.yaml | 18 +++------- Makefile | 2 +- 4 files changed, 61 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/robustness_nightly.yaml create mode 100644 .github/workflows/robustness_template.yaml diff --git a/.github/workflows/robustness_nightly.yaml b/.github/workflows/robustness_nightly.yaml new file mode 100644 index 000000000..8b2bdb81e --- /dev/null +++ b/.github/workflows/robustness_nightly.yaml @@ -0,0 +1,17 @@ +--- +name: Robustness Nightly +permissions: 
read-all +on: + schedule: + - cron: '25 9 * * *' # runs every day at 09:25 UTC + # workflow_dispatch enables manual testing of this job by maintainers + workflow_dispatch: + +jobs: + main: + # GHA has a maximum amount of 6h execution time, we try to get done within 3h + uses: ./.github/workflows/robustness_template.yaml + with: + count: 100 + testTimeout: 200m + runs-on: "['ubuntu-latest-8-cores']" diff --git a/.github/workflows/robustness_template.yaml b/.github/workflows/robustness_template.yaml new file mode 100644 index 000000000..54ed3b483 --- /dev/null +++ b/.github/workflows/robustness_template.yaml @@ -0,0 +1,38 @@ +--- +name: Reusable Robustness Workflow +on: + workflow_call: + inputs: + count: + required: true + type: number + testTimeout: + required: false + type: string + default: '30m' + runs-on: + required: false + type: string + default: "['ubuntu-latest']" +permissions: read-all + +jobs: + test: + timeout-minutes: 210 + runs-on: ${{ fromJson(inputs.runs-on) }} + steps: + - uses: actions/checkout@v4 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@v5 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - name: test-robustness + run: | + set -euo pipefail + + make gofail-enable + + # build bbolt with failpoint + go install ./cmd/bbolt + sudo -E PATH=$PATH make ROBUSTNESS_TESTFLAGS="--count ${{ inputs.count }} --timeout ${{ inputs.testTimeout }} -failfast" test-robustness diff --git a/.github/workflows/robustness_test.yaml b/.github/workflows/robustness_test.yaml index b1980ac65..a96854d4e 100644 --- a/.github/workflows/robustness_test.yaml +++ b/.github/workflows/robustness_test.yaml @@ -3,16 +3,8 @@ on: [push, pull_request] permissions: read-all jobs: test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - id: goversion - run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" - - uses: actions/setup-go@v5 - with: - go-version: ${{ 
steps.goversion.outputs.goversion }} - - run: | - make gofail-enable - # build bbolt with failpoint - go install ./cmd/bbolt - sudo -E PATH=$PATH make test-robustness + uses: ./.github/workflows/robustness_template.yaml + with: + count: 10 + testTimeout: 30m + runs-on: "['ubuntu-latest-8-cores']" diff --git a/Makefile b/Makefile index f43b25b20..b2e95df8e 100644 --- a/Makefile +++ b/Makefile @@ -84,4 +84,4 @@ test-failpoint: .PHONY: test-robustness # Running robustness tests requires root permission test-robustness: go test -v ${TESTFLAGS} ./tests/dmflakey -test.root - go test -v ${TESTFLAGS} ./tests/robustness -test.root + go test -v ${TESTFLAGS} ${ROBUSTNESS_TESTFLAGS} ./tests/robustness -test.root