From 4391028859353a9ffc210cedc580096ede0c8e00 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 17 Sep 2024 13:39:49 +0200 Subject: [PATCH] build: use enry to detect low priority files This is a much more robust detection mechanism. Additionally we have these signals we can also add in: func IsConfiguration(path string) bool func IsDocumentation(path string) bool func IsDotFile(path string) bool func IsImage(path string) bool My main concern with this change is generated file detection on content using up RAM or CPU. Will monitor this impact on pprof in production. Test Plan: go test. --- build/builder.go | 22 ++++++---------------- build/builder_test.go | 7 +++---- build/e2e_test.go | 2 +- gitindex/index.go | 6 +++--- gitindex/index_test.go | 2 +- 5 files changed, 14 insertions(+), 25 deletions(-) diff --git a/build/builder.go b/build/builder.go index dfbfcf7fa..4f8824e1f 100644 --- a/build/builder.go +++ b/build/builder.go @@ -37,7 +37,7 @@ import ( "time" "github.com/bmatcuk/doublestar" - "github.com/grafana/regexp" + "github.com/go-enry/go-enry/v2" "github.com/rs/xid" "github.com/sourcegraph/zoekt" @@ -901,18 +901,8 @@ func squashRange(j int) float64 { // // These 'priority' criteria affects how documents are ordered within a shard. It's // also used to help guess a file's rank when we're missing ranking information. -func IsLowPriority(file string) bool { - return testRe.MatchString(file) || isGenerated(file) || isVendored(file) -} - -var testRe = regexp.MustCompile("[Tt]est") - -func isGenerated(file string) bool { - return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map") -} - -func isVendored(file string) bool { - return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/") +func IsLowPriority(path string, content []byte) bool { + return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content) } type rankedDoc struct { @@ -931,17 +921,17 @@ func rank(d *zoekt.Document, origIdx int) []float64 { } generated := 0.0 - if isGenerated(d.Name) { + if enry.IsGenerated(d.Name, d.Content) { generated = 1.0 } vendor := 0.0 - if isVendored(d.Name) { + if enry.IsVendor(d.Name) { vendor = 1.0 } test := 0.0 - if testRe.MatchString(d.Name) { + if enry.IsTest(d.Name) { test = 1.0 } diff --git a/build/builder_test.go b/build/builder_test.go index e5f4742df..ecf5c82f8 100644 --- a/build/builder_test.go +++ b/build/builder_test.go @@ -764,8 +764,7 @@ func TestFindRepositoryMetadata(t *testing.T) { func TestIsLowPriority(t *testing.T) { cases := []string{ "builder_test.go", - "TestQuery.java", - "test/mocks.go", + "test/TestQuery.java", "search/vendor/thirdparty.cc", "search/node_modules/search/index.js", "search.min.js", @@ -774,7 +773,7 @@ func TestIsLowPriority(t *testing.T) { for _, tt := range cases { t.Run(tt, func(t *testing.T) { - if !IsLowPriority(tt) { + if !IsLowPriority(tt, nil) { t.Errorf("expected file '%s' to be low priority", tt) } }) @@ -788,7 +787,7 @@ func TestIsLowPriority(t *testing.T) { for _, tt := range negativeCases { t.Run(tt, func(t *testing.T) { - if IsLowPriority(tt) { + if IsLowPriority(tt, nil) { t.Errorf("did not expect file '%s' to be low priority", tt) } }) diff --git a/build/e2e_test.go b/build/e2e_test.go index f66b64b9e..1eb5acff4 100644 --- a/build/e2e_test.go +++ b/build/e2e_test.go @@ -502,7 +502,7 @@ func TestFileRank(t *testing.T) { name: "test", docs: []*zoekt.Document{ { - Name: "test", + Name: "foo_test.go", Content: []byte("bla"), }, { diff --git a/gitindex/index.go b/gitindex/index.go index 92cf3686c..ddf73e0d6 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -589,10 +589,10 @@ type repoPathRanks struct { // - If we have a concrete rank for this file, always use it // - If there's no rank, and it's a low priority file like a test, then use rank 0 // - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage -func (r repoPathRanks) rank(path string) float64 { +func (r repoPathRanks) rank(path string, content []byte) float64 { if rank, ok := r.Paths[path]; ok { return rank - } else if build.IsLowPriority(path) { + } else if build.IsLowPriority(path, content) { return 0.0 } else { return r.MeanRank @@ -910,7 +910,7 @@ func createDocument(key fileKey, var pathRanks []float64 if len(ranks.Paths) > 0 { // If the repository has ranking data, then store the file's rank. - pathRank := ranks.rank(keyFullPath) + pathRank := ranks.rank(keyFullPath, contents) pathRanks = []float64{pathRank} } diff --git a/gitindex/index_test.go b/gitindex/index_test.go index 63f5899a7..0aca04145 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -744,7 +744,7 @@ func TestRepoPathRanks(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - got := pathRanks.rank(tt.path) + got := pathRanks.rank(tt.path, nil) if got != tt.rank { t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got) }