Skip to content

Commit

Permalink
build: use enry to detect low priority files
Browse files Browse the repository at this point in the history
This is a much more robust detection mechanism. Additionally we have
these signals we can also add in:

  func IsConfiguration(path string) bool
  func IsDocumentation(path string) bool
  func IsDotFile(path string) bool
  func IsImage(path string) bool

My main concern with this change is generated file detection on content
using up RAM or CPU. Will monitor this impact on pprof in production.

Test Plan: go test.
  • Loading branch information
keegancsmith committed Sep 17, 2024
1 parent be438ef commit 4391028
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 25 deletions.
22 changes: 6 additions & 16 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
"time"

"github.com/bmatcuk/doublestar"
"github.com/grafana/regexp"
"github.com/go-enry/go-enry/v2"
"github.com/rs/xid"

"github.com/sourcegraph/zoekt"
Expand Down Expand Up @@ -901,18 +901,8 @@ func squashRange(j int) float64 {
//
// These 'priority' criteria affects how documents are ordered within a shard. It's
// also used to help guess a file's rank when we're missing ranking information.
func IsLowPriority(file string) bool {
return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
}

var testRe = regexp.MustCompile("[Tt]est")

func isGenerated(file string) bool {
return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
}

func isVendored(file string) bool {
return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
func IsLowPriority(path string, content []byte) bool {
return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
}

type rankedDoc struct {
Expand All @@ -931,17 +921,17 @@ func rank(d *zoekt.Document, origIdx int) []float64 {
}

generated := 0.0
if isGenerated(d.Name) {
if enry.IsGenerated(d.Name, d.Content) {
generated = 1.0
}

vendor := 0.0
if isVendored(d.Name) {
if enry.IsVendor(d.Name) {
vendor = 1.0
}

test := 0.0
if testRe.MatchString(d.Name) {
if enry.IsTest(d.Name) {
test = 1.0
}

Expand Down
7 changes: 3 additions & 4 deletions build/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -764,8 +764,7 @@ func TestFindRepositoryMetadata(t *testing.T) {
func TestIsLowPriority(t *testing.T) {
cases := []string{
"builder_test.go",
"TestQuery.java",
"test/mocks.go",
"test/TestQuery.java",
"search/vendor/thirdparty.cc",
"search/node_modules/search/index.js",
"search.min.js",
Expand All @@ -774,7 +773,7 @@ func TestIsLowPriority(t *testing.T) {

for _, tt := range cases {
t.Run(tt, func(t *testing.T) {
if !IsLowPriority(tt) {
if !IsLowPriority(tt, nil) {
t.Errorf("expected file '%s' to be low priority", tt)
}
})
Expand All @@ -788,7 +787,7 @@ func TestIsLowPriority(t *testing.T) {

for _, tt := range negativeCases {
t.Run(tt, func(t *testing.T) {
if IsLowPriority(tt) {
if IsLowPriority(tt, nil) {
t.Errorf("did not expect file '%s' to be low priority", tt)
}
})
Expand Down
2 changes: 1 addition & 1 deletion build/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ func TestFileRank(t *testing.T) {
name: "test",
docs: []*zoekt.Document{
{
Name: "test",
Name: "foo_test.go",
Content: []byte("bla"),
},
{
Expand Down
6 changes: 3 additions & 3 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,10 +589,10 @@ type repoPathRanks struct {
// - If we have a concrete rank for this file, always use it
// - If there's no rank, and it's a low priority file like a test, then use rank 0
// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
func (r repoPathRanks) rank(path string) float64 {
func (r repoPathRanks) rank(path string, content []byte) float64 {
if rank, ok := r.Paths[path]; ok {
return rank
} else if build.IsLowPriority(path) {
} else if build.IsLowPriority(path, content) {
return 0.0
} else {
return r.MeanRank
Expand Down Expand Up @@ -910,7 +910,7 @@ func createDocument(key fileKey,
var pathRanks []float64
if len(ranks.Paths) > 0 {
// If the repository has ranking data, then store the file's rank.
pathRank := ranks.rank(keyFullPath)
pathRank := ranks.rank(keyFullPath, contents)
pathRanks = []float64{pathRank}
}

Expand Down
2 changes: 1 addition & 1 deletion gitindex/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ func TestRepoPathRanks(t *testing.T) {

for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got := pathRanks.rank(tt.path)
got := pathRanks.rank(tt.path, nil)
if got != tt.rank {
t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got)
}
Expand Down

0 comments on commit 4391028

Please sign in to comment.