From 4391028859353a9ffc210cedc580096ede0c8e00 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 17 Sep 2024 13:39:49 +0200 Subject: [PATCH 1/3] build: use enry to detect low priority files This is a much more robust detection mechanism. Additionally we have these signals we can also add in: func IsConfiguration(path string) bool func IsDocumentation(path string) bool func IsDotFile(path string) bool func IsImage(path string) bool My main concern with this change is generated file detection on content using up RAM or CPU. Will monitor this impact on pprof in production. Test Plan: go test. --- build/builder.go | 22 ++++++---------------- build/builder_test.go | 7 +++---- build/e2e_test.go | 2 +- gitindex/index.go | 6 +++--- gitindex/index_test.go | 2 +- 5 files changed, 14 insertions(+), 25 deletions(-) diff --git a/build/builder.go b/build/builder.go index dfbfcf7fa..4f8824e1f 100644 --- a/build/builder.go +++ b/build/builder.go @@ -37,7 +37,7 @@ import ( "time" "github.com/bmatcuk/doublestar" - "github.com/grafana/regexp" + "github.com/go-enry/go-enry/v2" "github.com/rs/xid" "github.com/sourcegraph/zoekt" @@ -901,18 +901,8 @@ func squashRange(j int) float64 { // // These 'priority' criteria affects how documents are ordered within a shard. It's // also used to help guess a file's rank when we're missing ranking information. -func IsLowPriority(file string) bool { - return testRe.MatchString(file) || isGenerated(file) || isVendored(file) -} - -var testRe = regexp.MustCompile("[Tt]est") - -func isGenerated(file string) bool { - return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") -} - -func isVendored(file string) bool { - return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") +func IsLowPriority(path string, content []byte) bool { + return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) } type rankedDoc struct { @@ -931,17 +921,17 @@ func rank(d *zoekt.Document, origIdx int) []float64 { } generated := 0.0 - if isGenerated(d.Name) { + if enry.IsGenerated(d.Name, d.Content) { generated = 1.0 } vendor := 0.0 - if isVendored(d.Name) { + if enry.IsVendor(d.Name) { vendor = 1.0 } test := 0.0 - if testRe.MatchString(d.Name) { + if enry.IsTest(d.Name) { test = 1.0 } diff --git a/build/builder_test.go b/build/builder_test.go index e5f4742df..ecf5c82f8 100644 --- a/build/builder_test.go +++ b/build/builder_test.go @@ -764,8 +764,7 @@ func TestFindRepositoryMetadata(t *testing.T) { func TestIsLowPriority(t *testing.T) { cases := []string{ "builder_test.go", - "TestQuery.java", - "test/mocks.go", + "test/TestQuery.java", "search/vendor/thirdparty.cc", "search/node_modules/search/index.js", "search.min.js", @@ -774,7 +773,7 @@ func TestIsLowPriority(t *testing.T) { for _, tt := range cases { t.Run(tt, func(t *testing.T) { - if !IsLowPriority(tt) { + if !IsLowPriority(tt, nil) { t.Errorf("expected file '%s' to be low priority", tt) } }) @@ -788,7 +787,7 @@ func TestIsLowPriority(t *testing.T) { for _, tt := range negativeCases { t.Run(tt, func(t *testing.T) { - if IsLowPriority(tt) { + if IsLowPriority(tt, nil) { t.Errorf("did not expect file '%s' to be low priority", tt) } }) diff --git a/build/e2e_test.go b/build/e2e_test.go index f66b64b9e..1eb5acff4 100644 --- a/build/e2e_test.go +++ b/build/e2e_test.go @@ -502,7 +502,7 @@ func TestFileRank(t *testing.T) { name: "test", docs: []*zoekt.Document{ { - Name: "test", + Name: "foo_test.go", Content: []byte("bla"), }, { diff --git a/gitindex/index.go b/gitindex/index.go index 92cf3686c..ddf73e0d6 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -589,10 +589,10 @@ type repoPathRanks struct { // - If we have a concrete rank for this file, always use it // - If there's no rank, and it's a low priority file like a test, then use rank 0 // - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage -func (r repoPathRanks) rank(path string) float64 { +func (r repoPathRanks) rank(path string, content []byte) float64 { if rank, ok := r.Paths[path]; ok { return rank - } else if build.IsLowPriority(path) { + } else if build.IsLowPriority(path, content) { return 0.0 } else { return r.MeanRank @@ -910,7 +910,7 @@ func createDocument(key fileKey, var pathRanks []float64 if len(ranks.Paths) > 0 { // If the repository has ranking data, then store the file's rank. - pathRank := ranks.rank(keyFullPath) + pathRank := ranks.rank(keyFullPath, contents) pathRanks = []float64{pathRank} } diff --git a/gitindex/index_test.go b/gitindex/index_test.go index 63f5899a7..0aca04145 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -744,7 +744,7 @@ func TestRepoPathRanks(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - got := pathRanks.rank(tt.path) + got := pathRanks.rank(tt.path, nil) if got != tt.rank { t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got) } From 2db662521c49ae4481674b28b81100cd480233f0 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 17 Sep 2024 14:49:01 +0200 Subject: [PATCH 2/3] update e2e fixtures These all like improvements. We overly matched on test before, so now include test framework code which is interesting. And we now detect that a file is auto generated (the syscalls one) --- internal/e2e/testdata/coverage_data_writer.txt | 12 ++++++------ internal/e2e/testdata/test_server.txt | 10 +++++----- internal/e2e/testdata/time_compare.txt | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/internal/e2e/testdata/coverage_data_writer.txt b/internal/e2e/testdata/coverage_data_writer.txt index 06abe41e5..ce4b3b8dd 100644 --- a/internal/e2e/testdata/coverage_data_writer.txt +++ b/internal/e2e/testdata/coverage_data_writer.txt @@ -14,6 +14,12 @@ github.com/golang/go/src/cmd/cover/func.go 32:// as output the coverage data broken down by function, like this: hidden 8 more line matches +github.com/golang/go/src/testing/fuzz.go +93: Data []byte +205:// modify the underlying data of the arguments provided by the fuzzing engine. +275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) { +hidden 7 more line matches + github.com/golang/go/src/cmd/cover/html.go 199: Coverage float64 170:type templateData struct { @@ -26,12 +32,6 @@ github.com/golang/go/src/internal/fuzz/fuzz.go 908:func (c *coordinator) updateCoverage(newCoverage []byte) int { hidden 91 more line matches -github.com/golang/go/src/testing/fuzz.go -93: Data []byte -205:// modify the underlying data of the arguments provided by the fuzzing engine. -275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) { -hidden 7 more line matches - github.com/golang/go/src/cmd/vendor/golang.org/x/sys/unix/ztypes_linux.go 227: Data [7]byte 449: Data [8]uint32 diff --git a/internal/e2e/testdata/test_server.txt b/internal/e2e/testdata/test_server.txt index e569d8553..8cfce010c 100644 --- a/internal/e2e/testdata/test_server.txt +++ b/internal/e2e/testdata/test_server.txt @@ -32,10 +32,10 @@ github.com/sourcegraph/sourcegraph-public-snapshot/cmd/gitserver/server/server.g 741:func (s *Server) serverContext() (context.Context, context.CancelFunc) { hidden 166 more line matches -github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/main.go -150:type Server struct { -1232:func startServer(conf rootConfig) error { -1309:func newServer(conf rootConfig) (*Server, error) { -hidden 52 more line matches +github.com/sourcegraph/sourcegraph-public-snapshot/cmd/frontend/graphqlbackend/testing.go +46:type Test struct { +79:func RunTest(t *testing.T, test *Test) { +58:func RunTests(t *testing.T, tests []*Test) { +hidden 27 more line matches hidden 494 more file matches diff --git a/internal/e2e/testdata/time_compare.txt b/internal/e2e/testdata/time_compare.txt index 32da48b4f..b6b732d56 100644 --- a/internal/e2e/testdata/time_compare.txt +++ b/internal/e2e/testdata/time_compare.txt @@ -29,10 +29,10 @@ github.com/golang/go/src/go/constant/value.go 1381: re := Compare(x.re, token.EQL, y.re) hidden 1 more line matches -github.com/golang/go/src/syscall/zsyscall_windows.go -878:func GetSystemTimeAsFileTime(time *Filetime) { -1088:func SetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) { -132: procGetSystemTimeAsFileTime = modkernel32.NewProc("GetSystemTimeAsFileTime") -hidden 19 more line matches +github.com/golang/go/src/cmd/go/internal/gover/gover.go +36:func Compare(x, y string) int { +20:// but at the time this code was written, there was an existing test that used +49: if c := cmp.Compare(vx.kind, vy.kind); c != 0 { // "" < alpha < beta < rc +hidden 4 more line matches hidden 139 more file matches From 8c5a94741cd9fdc49d5cdb6393f363dbf0f6c3ed Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 18 Sep 2024 11:07:03 +0200 Subject: [PATCH 3/3] add test checking content --- build/builder_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/build/builder_test.go b/build/builder_test.go index ecf5c82f8..aec77a4e3 100644 --- a/build/builder_test.go +++ b/build/builder_test.go @@ -792,6 +792,17 @@ func TestIsLowPriority(t *testing.T) { } }) } + + // Explicitly check that content is important by using the same filename but + // different content. + normal := "package mock\n\nvar Mock struct {}" + generated := "// Code generated by mock\npackage mock\n\nvar Mock struct {}" + if IsLowPriority("mock.go", []byte(normal)) { + t.Error("expected non-generated content to not be low priority") + } + if !IsLowPriority("mock.go", []byte(generated)) { + t.Error("expected generated content to be low priority") + } } func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {