Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: remove unused feature "document ranks" #842

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -960,15 +960,6 @@ type SearchOptions struct {
// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
ChunkMatches bool

// EXPERIMENTAL. If true, document ranks are used as additional input for
// sorting matches.
UseDocumentRanks bool

// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
// their weight in the file match score. If the value is <= 0.0, the default weight value
// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
DocumentRanksWeight float64

// EXPERIMENTAL. If true, use text-search style scoring instead of the default
// scoring formula. The scoring algorithm treats each match in a file as a term
// and computes an approximation to BM25.
Expand Down Expand Up @@ -1036,14 +1027,9 @@ func (s *SearchOptions) String() string {
addDuration("MaxWallTime", s.MaxWallTime)
addDuration("FlushWallTime", s.FlushWallTime)

if s.DocumentRanksWeight > 0 {
add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
}

addBool("EstimateDocCount", s.EstimateDocCount)
addBool("Whole", s.Whole)
addBool("ChunkMatches", s.ChunkMatches)
addBool("UseDocumentRanks", s.UseDocumentRanks)
addBool("UseBM25Scoring", s.UseBM25Scoring)
addBool("Trace", s.Trace)
addBool("DebugScore", s.DebugScore)
Expand Down
7 changes: 2 additions & 5 deletions api_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ import (
"math/rand"
"reflect"

proto "github.com/sourcegraph/zoekt/grpc/protos/zoekt/webserver/v1"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/timestamppb"

proto "github.com/sourcegraph/zoekt/grpc/protos/zoekt/webserver/v1"
)

func FileMatchFromProto(p *proto.FileMatch) FileMatch {
Expand Down Expand Up @@ -696,8 +697,6 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
MaxMatchDisplayCount: int(p.GetMaxMatchDisplayCount()),
NumContextLines: int(p.GetNumContextLines()),
ChunkMatches: p.GetChunkMatches(),
UseDocumentRanks: p.GetUseDocumentRanks(),
DocumentRanksWeight: p.GetDocumentRanksWeight(),
Trace: p.GetTrace(),
DebugScore: p.GetDebugScore(),
UseBM25Scoring: p.GetUseBm25Scoring(),
Expand All @@ -721,8 +720,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
MaxMatchDisplayCount: int64(s.MaxMatchDisplayCount),
NumContextLines: int64(s.NumContextLines),
ChunkMatches: s.ChunkMatches,
UseDocumentRanks: s.UseDocumentRanks,
DocumentRanksWeight: s.DocumentRanksWeight,
Trace: s.Trace,
DebugScore: s.DebugScore,
UseBm25Scoring: s.UseBM25Scoring,
Expand Down
29 changes: 5 additions & 24 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,6 @@ type Options struct {
// last run.
IsDelta bool

// DocumentRanksPath is the path to the file with document ranks. If empty,
// ranks will be computed on-the-fly.
DocumentRanksPath string

// DocumentRanksVersion is a string which when changed will cause us to
// reindex a shard. This field is used so that when the contents of
// DocumentRanksPath changes, we can reindex.
DocumentRanksVersion string

// changedOrRemovedFiles is a list of file paths that have been changed or removed
// since the last indexing job for this repository. These files will be tombstoned
// in the older shards for this repository.
Expand All @@ -133,20 +124,15 @@ type HashOptions struct {
ctagsPath string
cTagsMustSucceed bool
largeFiles []string

// documentRankVersion is an experimental field which will change when the
// DocumentRanksPath content changes. If empty we ignore it.
documentRankVersion string
}

func (o *Options) HashOptions() HashOptions {
return HashOptions{
sizeMax: o.SizeMax,
disableCTags: o.DisableCTags,
ctagsPath: o.CTagsPath,
cTagsMustSucceed: o.CTagsMustSucceed,
largeFiles: o.LargeFiles,
documentRankVersion: o.DocumentRanksVersion,
sizeMax: o.SizeMax,
disableCTags: o.DisableCTags,
ctagsPath: o.CTagsPath,
cTagsMustSucceed: o.CTagsMustSucceed,
largeFiles: o.LargeFiles,
}
}

Expand All @@ -160,11 +146,6 @@ func (o *Options) GetHash() string {
hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))

if h.documentRankVersion != "" {
hasher.Write([]byte{0})
io.WriteString(hasher, h.documentRankVersion)
}

return fmt.Sprintf("%x", hasher.Sum(nil))
}

Expand Down
90 changes: 1 addition & 89 deletions build/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,92 +654,6 @@ func withoutTiebreaker(fullScore float64, useBM25 bool) float64 {
return math.Trunc(fullScore / zoekt.ScoreOffset)
}

func TestDocumentRanks(t *testing.T) {
requireCTags(t)
dir := t.TempDir()

opts := Options{
IndexDir: dir,
RepositoryDescription: zoekt.Repository{
Name: "repo",
},
DocumentRanksVersion: "ranking",
}

searchQuery := &query.Substring{Content: true, Pattern: "Inner"}
exampleJava, err := os.ReadFile("./testdata/example.java")
if err != nil {
t.Fatal(err)
}

cases := []struct {
name string
documentRank float64
documentRanksWeight float64
wantScore float64
}{
{
name: "score with no document ranks",
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match)
wantScore: 7000.00,
},
{
name: "score with document ranks",
documentRank: 0.8,
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 225 (file rank)
wantScore: 7225.00,
},
{
name: "score with custom document ranks weight",
documentRank: 0.8,
documentRanksWeight: 1000.0,
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 25.00 (file rank)
wantScore: 7025.00,
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
b, err := NewBuilder(opts)
if err != nil {
t.Fatalf("NewBuilder: %v", err)
}

err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: []float64{c.documentRank}})
if err != nil {
t.Fatal(err)
}

if err := b.Finish(); err != nil {
t.Fatalf("Finish: %v", err)
}

ss, err := shards.NewDirectorySearcher(dir)
if err != nil {
t.Fatalf("NewDirectorySearcher(%s): %v", dir, err)
}
defer ss.Close()

srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{
UseDocumentRanks: true,
DocumentRanksWeight: c.documentRanksWeight,
DebugScore: true,
})
if err != nil {
t.Fatal(err)
}

if got, want := len(srs.Files), 1; got != want {
t.Fatalf("file matches: want %d, got %d", want, got)
}

if got := withoutTiebreaker(srs.Files[0].Score, false); got != c.wantScore {
t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore)
}
})
}
}

func TestRepoRanks(t *testing.T) {
requireCTags(t)
dir := t.TempDir()
Expand All @@ -749,7 +663,6 @@ func TestRepoRanks(t *testing.T) {
RepositoryDescription: zoekt.Repository{
Name: "repo",
},
DocumentRanksVersion: "ranking",
}

searchQuery := &query.Substring{Content: true, Pattern: "Inner"}
Expand Down Expand Up @@ -810,8 +723,7 @@ func TestRepoRanks(t *testing.T) {
defer ss.Close()

srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{
UseDocumentRanks: true,
DebugScore: true,
DebugScore: true,
})
if err != nil {
t.Fatal(err)
Expand Down
6 changes: 1 addition & 5 deletions cmd/zoekt-git-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ import (
"strings"

"github.com/dustin/go-humanize"
"github.com/sourcegraph/zoekt/internal/profiler"
"go.uber.org/automaxprocs/maxprocs"

"github.com/sourcegraph/zoekt/cmd"
"github.com/sourcegraph/zoekt/ctags"
"github.com/sourcegraph/zoekt/gitindex"
"github.com/sourcegraph/zoekt/internal/profiler"
)

func run() int {
Expand All @@ -43,8 +43,6 @@ func run() int {
"It also affects name if the indexed repository is under this directory.")
isDelta := flag.Bool("delta", false, "whether we should use delta build")
deltaShardNumberFallbackThreshold := flag.Uint64("delta_threshold", 0, "upper limit on the number of preexisting shards that can exist before attempting a delta build (0 to disable fallback behavior)")
offlineRanking := flag.String("offline_ranking", "", "the name of the file that contains the ranking info.")
offlineRankingVersion := flag.String("offline_ranking_version", "", "a version string identifying the contents in offline_ranking.")
Comment on lines -46 to -47
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is backward incompatible but I believe it's very unlikely that anyone used this outside Sourcegraph.

languageMap := flag.String("language_map", "", "a mapping between a language and its ctags processor (a:0,b:3).")

cpuProfile := flag.String("cpuprofile", "", "write cpu profile to `file`")
Expand Down Expand Up @@ -76,8 +74,6 @@ func run() int {

opts := cmd.OptionsFromFlags()
opts.IsDelta = *isDelta
opts.DocumentRanksPath = *offlineRanking
opts.DocumentRanksVersion = *offlineRankingVersion

var branches []string
if *branchesStr != "" {
Expand Down
46 changes: 0 additions & 46 deletions cmd/zoekt-sourcegraph-indexserver/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"context"
"crypto/sha1"
"encoding/json"
"errors"
"fmt"
"io"
Expand Down Expand Up @@ -53,11 +52,6 @@ type IndexOptions struct {
// Priority indicates ranking in results, higher first.
Priority float64

// DocumentRanksVersion when non-empty will lead to indexing using offline
// ranking. When the string changes this will also cause us to re-index with
// new ranks.
DocumentRanksVersion string

// Public is true if the repository is public.
Public bool

Expand Down Expand Up @@ -133,8 +127,6 @@ func (o *indexArgs) BuildOptions() *build.Options {
DisableCTags: !o.Symbols,
IsDelta: o.UseDelta,

DocumentRanksVersion: o.DocumentRanksVersion,

LanguageMap: o.LanguageMap,

ShardMerging: o.ShardMerging,
Expand Down Expand Up @@ -381,44 +373,6 @@ func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *i
"-submodules=false",
}

if o.DocumentRanksVersion != "" {
// We store the document ranks as JSON in gitDir and tell zoekt-git-index where
// to find the file.
documentsRankFile := filepath.Join(gitDir, "documents.rank")

saveDocumentRanks := func() error {
r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name)
if err != nil {
return fmt.Errorf("GetDocumentRanks: %w", err)
}

b, err := json.Marshal(r)
if err != nil {
return err
}

if err := os.WriteFile(documentsRankFile, b, 0o600); err != nil {
return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err)
}

return nil
}

if err := saveDocumentRanks(); err != nil {
// log and fall back to online ranking
logger.Warn(
"error saving document ranks. Falling back to online ranking",
sglog.Error(err),
sglog.String("repo", o.Name),
sglog.Uint32("id", o.RepoID),
)
} else {
args = append(args,
"-offline_ranking", documentsRankFile,
"-offline_ranking_version", o.DocumentRanksVersion)
}
}

// Even though we check for incremental in this process, we still pass it
// in just in case we regress in how we check in process. We will still
// notice thanks to metrics and increased load on gitserver.
Expand Down
9 changes: 0 additions & 9 deletions cmd/zoekt-sourcegraph-indexserver/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,6 @@ var splitargs = cmpopts.AcyclicTransformer("splitargs", func(cmd string) []strin
type mockGRPCClient struct {
mockSearchConfiguration func(context.Context, *proto.SearchConfigurationRequest, ...grpc.CallOption) (*proto.SearchConfigurationResponse, error)
mockList func(context.Context, *proto.ListRequest, ...grpc.CallOption) (*proto.ListResponse, error)
mockDocumentRanks func(context.Context, *proto.DocumentRanksRequest, ...grpc.CallOption) (*proto.DocumentRanksResponse, error)
mockUpdateIndexStatus func(context.Context, *proto.UpdateIndexStatusRequest, ...grpc.CallOption) (*proto.UpdateIndexStatusResponse, error)
}

Expand All @@ -669,14 +668,6 @@ func (m *mockGRPCClient) List(ctx context.Context, in *proto.ListRequest, opts .
return nil, fmt.Errorf("mock RPC List not implemented")
}

func (m *mockGRPCClient) DocumentRanks(ctx context.Context, in *proto.DocumentRanksRequest, opts ...grpc.CallOption) (*proto.DocumentRanksResponse, error) {
if m.mockDocumentRanks != nil {
return m.mockDocumentRanks(ctx, in, opts...)
}

return nil, fmt.Errorf("mock RPC DocumentRanks not implemented")
}

func (m *mockGRPCClient) UpdateIndexStatus(ctx context.Context, in *proto.UpdateIndexStatusRequest, opts ...grpc.CallOption) (*proto.UpdateIndexStatusResponse, error) {
if m.mockUpdateIndexStatus != nil {
return m.mockUpdateIndexStatus(ctx, in, opts...)
Expand Down
Loading
Loading