Skip to content

Commit

Permalink
build: use enry to detect low priority files (#829)
Browse files Browse the repository at this point in the history
This is a much more robust detection mechanism. Additionally we have
these signals we can also add in:

  func IsConfiguration(path string) bool
  func IsDocumentation(path string) bool
  func IsDotFile(path string) bool
  func IsImage(path string) bool

My main concern with this change is generated file detection on content
using up RAM or CPU. Will monitor this impact on pprof in production.

Test Plan: go test.
  • Loading branch information
keegancsmith authored Sep 18, 2024
1 parent 87f1948 commit a8d7c8b
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 41 deletions.
22 changes: 6 additions & 16 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
"time"

"github.com/bmatcuk/doublestar"
"github.com/grafana/regexp"
"github.com/go-enry/go-enry/v2"
"github.com/rs/xid"

"github.com/sourcegraph/zoekt"
Expand Down Expand Up @@ -901,18 +901,8 @@ func squashRange(j int) float64 {
//
// These 'priority' criteria affects how documents are ordered within a shard. It's
// also used to help guess a file's rank when we're missing ranking information.
func IsLowPriority(file string) bool {
return testRe.MatchString(file) || isGenerated(file) || isVendored(file)
}

var testRe = regexp.MustCompile("[Tt]est")

func isGenerated(file string) bool {
return strings.HasSuffix(file, "min.js") || strings.HasSuffix(file, "js.map")
}

func isVendored(file string) bool {
return strings.Contains(file, "vendor/") || strings.Contains(file, "node_modules/")
func IsLowPriority(path string, content []byte) bool {
return enry.IsTest(path) || enry.IsVendor(path) || enry.IsGenerated(path, content)
}

type rankedDoc struct {
Expand All @@ -931,17 +921,17 @@ func rank(d *zoekt.Document, origIdx int) []float64 {
}

generated := 0.0
if isGenerated(d.Name) {
if enry.IsGenerated(d.Name, d.Content) {
generated = 1.0
}

vendor := 0.0
if isVendored(d.Name) {
if enry.IsVendor(d.Name) {
vendor = 1.0
}

test := 0.0
if testRe.MatchString(d.Name) {
if enry.IsTest(d.Name) {
test = 1.0
}

Expand Down
18 changes: 14 additions & 4 deletions build/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -764,8 +764,7 @@ func TestFindRepositoryMetadata(t *testing.T) {
func TestIsLowPriority(t *testing.T) {
cases := []string{
"builder_test.go",
"TestQuery.java",
"test/mocks.go",
"test/TestQuery.java",
"search/vendor/thirdparty.cc",
"search/node_modules/search/index.js",
"search.min.js",
Expand All @@ -774,7 +773,7 @@ func TestIsLowPriority(t *testing.T) {

for _, tt := range cases {
t.Run(tt, func(t *testing.T) {
if !IsLowPriority(tt) {
if !IsLowPriority(tt, nil) {
t.Errorf("expected file '%s' to be low priority", tt)
}
})
Expand All @@ -788,11 +787,22 @@ func TestIsLowPriority(t *testing.T) {

for _, tt := range negativeCases {
t.Run(tt, func(t *testing.T) {
if IsLowPriority(tt) {
if IsLowPriority(tt, nil) {
t.Errorf("did not expect file '%s' to be low priority", tt)
}
})
}

// Explicitly check that content is important by using the same filename but
// different content.
normal := "package mock\n\nvar Mock struct {}"
generated := "// Code generated by mock\npackage mock\n\nvar Mock struct {}"
if IsLowPriority("mock.go", []byte(normal)) {
t.Error("expected non-generated content to not be low priority")
}
if !IsLowPriority("mock.go", []byte(generated)) {
t.Error("expected generated content to be low priority")
}
}

func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {
Expand Down
2 changes: 1 addition & 1 deletion build/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ func TestFileRank(t *testing.T) {
name: "test",
docs: []*zoekt.Document{
{
Name: "test",
Name: "foo_test.go",
Content: []byte("bla"),
},
{
Expand Down
6 changes: 3 additions & 3 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,10 +589,10 @@ type repoPathRanks struct {
// - If we have a concrete rank for this file, always use it
// - If there's no rank, and it's a low priority file like a test, then use rank 0
// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
func (r repoPathRanks) rank(path string) float64 {
func (r repoPathRanks) rank(path string, content []byte) float64 {
if rank, ok := r.Paths[path]; ok {
return rank
} else if build.IsLowPriority(path) {
} else if build.IsLowPriority(path, content) {
return 0.0
} else {
return r.MeanRank
Expand Down Expand Up @@ -910,7 +910,7 @@ func createDocument(key fileKey,
var pathRanks []float64
if len(ranks.Paths) > 0 {
// If the repository has ranking data, then store the file's rank.
pathRank := ranks.rank(keyFullPath)
pathRank := ranks.rank(keyFullPath, contents)
pathRanks = []float64{pathRank}
}

Expand Down
2 changes: 1 addition & 1 deletion gitindex/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ func TestRepoPathRanks(t *testing.T) {

for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got := pathRanks.rank(tt.path)
got := pathRanks.rank(tt.path, nil)
if got != tt.rank {
t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got)
}
Expand Down
12 changes: 6 additions & 6 deletions internal/e2e/testdata/coverage_data_writer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ github.com/golang/go/src/cmd/cover/func.go
32:// as output the coverage data broken down by function, like this:
hidden 8 more line matches

github.com/golang/go/src/testing/fuzz.go
93: Data []byte
205:// modify the underlying data of the arguments provided by the fuzzing engine.
275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) {
hidden 7 more line matches

github.com/golang/go/src/cmd/cover/html.go
199: Coverage float64
170:type templateData struct {
Expand All @@ -26,12 +32,6 @@ github.com/golang/go/src/internal/fuzz/fuzz.go
908:func (c *coordinator) updateCoverage(newCoverage []byte) int {
hidden 91 more line matches

github.com/golang/go/src/testing/fuzz.go
93: Data []byte
205:// modify the underlying data of the arguments provided by the fuzzing engine.
275: run := func(captureOut io.Writer, e corpusEntry) (ok bool) {
hidden 7 more line matches

github.com/golang/go/src/cmd/vendor/golang.org/x/sys/unix/ztypes_linux.go
227: Data [7]byte
449: Data [8]uint32
Expand Down
10 changes: 5 additions & 5 deletions internal/e2e/testdata/test_server.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ github.com/sourcegraph/sourcegraph-public-snapshot/cmd/gitserver/server/server.g
741:func (s *Server) serverContext() (context.Context, context.CancelFunc) {
hidden 166 more line matches

github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/main.go
150:type Server struct {
1232:func startServer(conf rootConfig) error {
1309:func newServer(conf rootConfig) (*Server, error) {
hidden 52 more line matches
github.com/sourcegraph/sourcegraph-public-snapshot/cmd/frontend/graphqlbackend/testing.go
46:type Test struct {
79:func RunTest(t *testing.T, test *Test) {
58:func RunTests(t *testing.T, tests []*Test) {
hidden 27 more line matches

hidden 494 more file matches
10 changes: 5 additions & 5 deletions internal/e2e/testdata/time_compare.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ github.com/golang/go/src/go/constant/value.go
1381: re := Compare(x.re, token.EQL, y.re)
hidden 1 more line matches

github.com/golang/go/src/syscall/zsyscall_windows.go
878:func GetSystemTimeAsFileTime(time *Filetime) {
1088:func SetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) {
132: procGetSystemTimeAsFileTime = modkernel32.NewProc("GetSystemTimeAsFileTime")
hidden 19 more line matches
github.com/golang/go/src/cmd/go/internal/gover/gover.go
36:func Compare(x, y string) int {
20:// but at the time this code was written, there was an existing test that used
49: if c := cmp.Compare(vx.kind, vy.kind); c != 0 { // "" < alpha < beta < rc
hidden 4 more line matches

hidden 139 more file matches

0 comments on commit a8d7c8b

Please sign in to comment.