From 3833927f8b1a17f4f0480af5d4a8ed051910a0ca Mon Sep 17 00:00:00 2001 From: Kostiantyn Masliuk <1pkg@protonmail.com> Date: Fri, 4 Oct 2024 09:57:17 -0700 Subject: [PATCH] PGO: optimize collected profile file size (#14256) --- .github/workflows/benchmarks.yml | 2 +- systemtest/benchtest/profiles.go | 41 ++++++++++++++++++++++++-------- testing/benchmark/Makefile | 1 - 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index bdf4e14faf5..799cbcd1800 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -226,7 +226,7 @@ jobs: git_commit_gpgsign: true - name: Open PGO PR - if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }} + if: ${{ env.RUN_STANDALONE == 'true' }} run: ${{ github.workspace }}/.ci/scripts/push-pgo-pr.sh env: WORKSPACE_PATH: ${{ github.workspace }} diff --git a/systemtest/benchtest/profiles.go b/systemtest/benchtest/profiles.go index e3dd240df44..0e0dcf515c8 100644 --- a/systemtest/benchtest/profiles.go +++ b/systemtest/benchtest/profiles.go @@ -18,9 +18,11 @@ package benchtest import ( + "compress/gzip" "context" "fmt" "io" + "math/rand/v2" "net/http" "os" "strconv" @@ -88,12 +90,30 @@ func (p *profiles) recordCPU() error { if benchConfig.CPUProfile == "" { return nil } - duration := benchConfig.Benchtime - profile, err := fetchProfile("/debug/pprof/profile", duration) - if err != nil { - return fmt.Errorf("failed to fetch CPU profile: %w", err) + // Limit profiling time to random 5% of overall time. + // This should not seriously affect the profile quality, + // since we merge the final profile from multiple sources, + // but prevent profile size from swelling.
+ var done bool + const tickets = 20 + duration := benchConfig.Benchtime / tickets + for i := range tickets { + if done || (rand.N(tickets-i)+i+1) < tickets { + time.Sleep(duration) + continue + } + profile, err := fetchProfile("/debug/pprof/profile", duration) + if err != nil { + return fmt.Errorf("failed to fetch CPU profile: %w", err) + } + // We don't need the address in the profile, so discard it to reduce the size. + if err := profile.Aggregate(true, true, true, true, false); err != nil { + return fmt.Errorf("failed to fetch CPU profile: %w", err) + } + profile = profile.Compact() + p.cpu = append(p.cpu, profile) + done = true } - p.cpu = append(p.cpu, profile) return nil } @@ -168,14 +188,15 @@ func (p *profiles) writeDeltas(filename string, deltas []*profile.Profile) error return err } defer f.Close() - return merged.Write(f) + w, err := gzip.NewWriterLevel(f, gzip.BestCompression) + if err != nil { + return err + } + defer w.Close() + return merged.WriteUncompressed(w) } func (p *profiles) mergeBenchmarkProfiles(profiles []*profile.Profile) (*profile.Profile, error) { - for i, profile := range profiles { - benchmarkName := p.benchmarkNames[i] - profile.SetLabel("benchmark", []string{benchmarkName}) - } merged, err := profile.Merge(profiles) if err != nil { return nil, fmt.Errorf("error merging profiles: %w", err) diff --git a/testing/benchmark/Makefile b/testing/benchmark/Makefile index 3344de04c04..9baf132edaf 100644 --- a/testing/benchmark/Makefile +++ b/testing/benchmark/Makefile @@ -30,7 +30,6 @@ SSH_OPTS ?= -o LogLevel=ERROR -o StrictHostKeyChecking=no -o ServerAliveInterval SSH_KEY ?= ~/.ssh/id_rsa_terraform WORKER_IP = $(shell terraform output -raw public_ip) APM_SERVER_IP = $(shell terraform output -raw apm_server_ip) -RUN_STANDALONE = $(shell echo var.run_standalone | terraform console | tr -d '"') SHELL = /bin/bash .SHELLFLAGS = -o pipefail -c