From 6984e15d7a781f2d455ba313d98528969034128d Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith
Date: Thu, 19 Oct 2023 17:21:52 +0200
Subject: [PATCH] score: experimental extension novelty in sorting

Right now we boost a file extension that hasn't been seen to the 3rd
position. This is gated by an environment variable. I want to explore if
there are ways we can turn on this behaviour with the query language.

Test Plan: ZOEKT_NOVELTY=1 go run ./cmd/zoekt foo
---
 contentprovider.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/contentprovider.go b/contentprovider.go
index 5f27d2050..8526a4051 100644
--- a/contentprovider.go
+++ b/contentprovider.go
@@ -18,9 +18,13 @@ import (
 	"bytes"
 	"fmt"
 	"log"
+	"os"
+	"path"
 	"sort"
 	"strings"
 	"unicode/utf8"
+
+	"golang.org/x/exp/slices"
 )
 
 var _ = log.Println
@@ -913,9 +917,53 @@ func sortChunkMatchesByScore(ms []ChunkMatch) {
 	sort.Sort(chunkMatchScoreSlice(ms))
 }
 
+var doNovelty = os.Getenv("ZOEKT_NOVELTY") != ""
+
 // SortFiles sorts files matches. The order depends on the match score, which includes both
 // query-dependent signals like word overlap, and file-only signals like the file ranks (if
 // file ranks are enabled).
 func SortFiles(ms []FileMatch) {
 	sort.Sort(fileMatchesByScore(ms))
+
+	if doNovelty {
+		// Experimentally boost something into the third filematch
+		boostNovelExtension(ms, 2, 0.9)
+	}
+}
+
+func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
+	if len(ms) <= boostOffset+1 {
+		return
+	}
+
+	top := ms[:boostOffset]
+	candidates := ms[boostOffset:]
+
+	// Don't bother boosting something which is significantly different to the
+	// result it replaces.
+	minScoreForNovelty := candidates[0].Score * minScoreRatio
+
+	// We want to look for an ext that isn't in the top exts
+	exts := make([]string, len(top))
+	for i := range top {
+		exts[i] = path.Ext(top[i].FileName)
+	}
+
+	for i := range candidates {
+		// Do not assume sorted due to boostNovelExtension being called on subsets
+		if candidates[i].Score < minScoreForNovelty {
+			continue
+		}
+
+		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
+			continue
+		}
+
+		// Found what we are looking for, now boost to front of candidates (which
+		// is ms[boostOffset])
+		for ; i > 0; i-- {
+			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
+		}
+		return
+	}
 }