diff --git a/.github/workflows/robustness_nightly.yaml b/.github/workflows/robustness_nightly.yaml
new file mode 100644
index 000000000..8b2bdb81e
--- /dev/null
+++ b/.github/workflows/robustness_nightly.yaml
@@ -0,0 +1,17 @@
+---
+name: Robustness Nightly
+permissions: read-all
+on:
+  schedule:
+    - cron: '25 9 * * *' # runs every day at 09:25 UTC
+  # workflow_dispatch enables manual testing of this job by maintainers
+  workflow_dispatch:
+
+jobs:
+  main:
+    # GHA has a maximum of 6h execution time; the 200m test timeout below keeps us well within it
+    uses: ./.github/workflows/robustness_template.yaml
+    with:
+      count: 100
+      testTimeout: 200m
+      runs-on: "['ubuntu-latest-8-cores']"
diff --git a/.github/workflows/robustness_template.yaml b/.github/workflows/robustness_template.yaml
new file mode 100644
index 000000000..54ed3b483
--- /dev/null
+++ b/.github/workflows/robustness_template.yaml
@@ -0,0 +1,43 @@
+---
+name: Reusable Robustness Workflow
+on:
+  workflow_call:
+    inputs:
+      count:
+        required: true
+        type: number
+      testTimeout:
+        required: false
+        type: string
+        default: '30m'
+      runs-on:
+        required: false
+        type: string
+        default: "['ubuntu-latest']"
+permissions: read-all
+
+jobs:
+  test:
+    # job-level cap; keep it above the testTimeout value callers pass in
+    timeout-minutes: 210
+    runs-on: ${{ fromJson(inputs.runs-on) }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: goversion
+        run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT"
+      - uses: actions/setup-go@v5
+        with:
+          go-version: ${{ steps.goversion.outputs.goversion }}
+      - name: test-robustness
+        # pass inputs through the environment instead of expanding them into the script text
+        env:
+          ROBUSTNESS_COUNT: ${{ inputs.count }}
+          ROBUSTNESS_TIMEOUT: ${{ inputs.testTimeout }}
+        run: |
+          set -euo pipefail
+
+          make gofail-enable
+
+          # build bbolt with failpoint
+          go install ./cmd/bbolt
+          sudo -E PATH=$PATH make ROBUSTNESS_TESTFLAGS="--count ${ROBUSTNESS_COUNT} --timeout ${ROBUSTNESS_TIMEOUT} -failfast" test-robustness
diff --git a/.github/workflows/robustness_test.yaml b/.github/workflows/robustness_test.yaml
index b1980ac65..a96854d4e 100644
--- a/.github/workflows/robustness_test.yaml
+++ b/.github/workflows/robustness_test.yaml
@@ -3,16 +3,8 @@ on: [push, 
pull_request]
 permissions: read-all
 jobs:
   test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - id: goversion
-        run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT"
-      - uses: actions/setup-go@v5
-        with:
-          go-version: ${{ steps.goversion.outputs.goversion }}
-      - run: |
-          make gofail-enable
-          # build bbolt with failpoint
-          go install ./cmd/bbolt
-          sudo -E PATH=$PATH make test-robustness
+    uses: ./.github/workflows/robustness_template.yaml
+    with:
+      count: 10
+      testTimeout: 30m
+      runs-on: "['ubuntu-latest-8-cores']"
diff --git a/Makefile b/Makefile
index f43b25b20..b2e95df8e 100644
--- a/Makefile
+++ b/Makefile
@@ -84,4 +84,4 @@ test-failpoint:
 .PHONY: test-robustness # Running robustness tests requires root permission
 test-robustness:
 	go test -v ${TESTFLAGS} ./tests/dmflakey -test.root
-	go test -v ${TESTFLAGS} ./tests/robustness -test.root
+	go test -v ${TESTFLAGS} ${ROBUSTNESS_TESTFLAGS} ./tests/robustness -test.root
diff --git a/tests/dmflakey/dmflakey.go b/tests/dmflakey/dmflakey.go
index d9bdf99a0..25061a4cb 100644
--- a/tests/dmflakey/dmflakey.go
+++ b/tests/dmflakey/dmflakey.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"strings"
 	"time"
@@ -289,6 +290,11 @@ func createEmptyFSImage(imgPath string, fsType FSType) error {
 		return fmt.Errorf("failed to create image because %s already exists", imgPath)
 	}
 
+	// Make sure the parent directory exists before the image file is created.
+	if err := os.MkdirAll(path.Dir(imgPath), 0755); err != nil {
+		return fmt.Errorf("failed to ensure parent directory %s: %w", path.Dir(imgPath), err)
+	}
+
 	f, err := os.Create(imgPath)
 	if err != nil {
 		return fmt.Errorf("failed to create image %s: %w", imgPath, err)
diff --git a/tests/robustness/powerfailure_test.go b/tests/robustness/powerfailure_test.go
index a1d0bc598..09ae88124 100644
--- a/tests/robustness/powerfailure_test.go
+++ b/tests/robustness/powerfailure_test.go
@@ -4,8 +4,11 @@ package robustness
 
 import (
 	"bytes"
+	"crypto/rand"
 	"fmt"
 	"io"
+	"math"
+	"math/big"
 	"net/http"
 	"net/url"
 	"os"
@@ -23,9 
+26,71 @@ import (
 	"golang.org/x/sys/unix"
 )
 
+// panicFailpoints lists the failpoint names doPowerFailure chooses from; the
+// picked one is activated with the "panic" action.
+var panicFailpoints = []string{
+	"beforeSyncDataPages",
+	"beforeSyncMetaPage",
+	"lackOfDiskSpace",
+	"mapError",
+	"resizeFileError",
+	"unmapError",
+}
+
 // TestRestartFromPowerFailure is to test data after unexpected power failure.
 func TestRestartFromPowerFailure(t *testing.T) {
-	flakey := initFlakeyDevice(t, t.Name(), dmflakey.FSTypeEXT4, "")
+	for _, tc := range []struct {
+		name         string
+		du           time.Duration
+		fsMountOpt   string
+		useFailpoint bool
+	}{
+		{
+			name:         "fp_ext4_commit5s",
+			du:           5 * time.Second,
+			fsMountOpt:   "commit=5",
+			useFailpoint: true,
+		},
+		{
+			name:         "fp_ext4_commit1s",
+			du:           10 * time.Second,
+			fsMountOpt:   "commit=1",
+			useFailpoint: true,
+		},
+		{
+			name:         "fp_ext4_commit1000s",
+			du:           10 * time.Second,
+			fsMountOpt:   "commit=1000",
+			useFailpoint: true,
+		},
+		{
+			name:       "kill_ext4_commit5s",
+			du:         5 * time.Second,
+			fsMountOpt: "commit=5",
+		},
+		{
+			name:       "kill_ext4_commit1s",
+			du:         10 * time.Second,
+			fsMountOpt: "commit=1",
+		},
+		{
+			name:       "kill_ext4_commit1000s",
+			du:         10 * time.Second,
+			fsMountOpt: "commit=1000",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			doPowerFailure(t, tc.du, tc.fsMountOpt, tc.useFailpoint)
+		})
+	}
+}
+
+// doPowerFailure runs one scenario on an ext4 flakey device obtained from
+// initFlakeyDevice (fsMountOpt is passed through to it). After sleeping for du
+// it either activates a randomly picked entry of panicFailpoints with the
+// "panic" action (useFailpoint) or kills the process held in cmd.
+func doPowerFailure(t *testing.T, du time.Duration, fsMountOpt string, useFailpoint bool) {
+	flakey := initFlakeyDevice(t, strings.ReplaceAll(t.Name(), "/", "_"), dmflakey.FSTypeEXT4, fsMountOpt)
 	root := flakey.RootFS()
 
 	dbPath := filepath.Join(root, "boltdb")
@@ -38,6 +103,10 @@ func TestRestartFromPowerFailure(t *testing.T) {
 	}
 
 	logPath := filepath.Join(t.TempDir(), fmt.Sprintf("%s.log", t.Name()))
+	// The subtest name contains "/", so logPath sits one directory below
+	// t.TempDir(); create that directory before creating the log file.
+	require.NoError(t, os.MkdirAll(path.Dir(logPath), 0755))
+
 	logFd, err := os.Create(logPath)
 	require.NoError(t, err)
 	defer logFd.Close()
@@ -64,10 +133,18 @@ func TestRestartFromPowerFailure(t *testing.T) {
 		}
 	}()
 
-	time.Sleep(time.Duration(time.Now().UnixNano()%5+1) * time.Second)
+	time.Sleep(du)
 	t.Logf("simulate power failure")
 
-	activeFailpoint(t, fpURL, "beforeSyncMetaPage", "panic")
+	if useFailpoint {
+		fpURL = "http://" + fpURL
+		targetFp := panicFailpoints[randomInt(t, math.MaxInt32)%len(panicFailpoints)]
+		t.Logf("random pick failpoint: %s", targetFp)
+		activeFailpoint(t, fpURL, targetFp, "panic")
+	} else {
+		t.Log("kill bbolt")
+		assert.NoError(t, cmd.Process.Kill())
+	}
 
 	select {
 	case <-time.After(10 * time.Second):
@@ -89,10 +166,10 @@ func TestRestartFromPowerFailure(t *testing.T) {
 
 // activeFailpoint actives the failpoint by http.
 func activeFailpoint(t *testing.T, targetUrl string, fpName, fpVal string) {
-	u, err := url.Parse("http://" + path.Join(targetUrl, fpName))
+	u, err := url.JoinPath(targetUrl, fpName)
 	require.NoError(t, err, "parse url %s", targetUrl)
 
-	req, err := http.NewRequest("PUT", u.String(), bytes.NewBuffer([]byte(fpVal)))
+	req, err := http.NewRequest("PUT", u, bytes.NewBuffer([]byte(fpVal)))
 	require.NoError(t, err)
 
 	resp, err := http.DefaultClient.Do(req)
@@ -192,3 +269,12 @@ func unmountAll(target string) error {
 	}
 	return fmt.Errorf("failed to umount %s: %w", target, unix.EBUSY)
 }
+
+// randomInt returns a random integer in [0, max) drawn from crypto/rand.
+// max must be positive. The test stops immediately if the random source
+// returns an error, so n is never read after a failed draw.
+func randomInt(t *testing.T, max int) int {
+	n, err := rand.Int(rand.Reader, big.NewInt(int64(max)))
+	require.NoError(t, err)
+	return int(n.Int64())
+}