From 06e9071629461b4a390874dbf82ae193aa1f1662 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 14 Nov 2023 14:54:26 +0200 Subject: [PATCH] rfc: switch to enum for matches signature I am looking for feedback on this change before improving documentation/etc. Any feedback if this is worthwhile or on naming please. Test Plan: go test ./... --- eval.go | 15 ++--- matchiter.go | 4 +- matchtree.go | 158 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 107 insertions(+), 70 deletions(-) diff --git a/eval.go b/eval.go index 720ea5cf..14a977cc 100644 --- a/eval.go +++ b/eval.go @@ -291,15 +291,16 @@ nextFileMatch: md := d.repoMetaData[d.repos[nextDoc]] for cost := costMin; cost <= costMax; cost++ { - v, ok := mt.matches(cp, cost, known) - if ok && !v { + switch mt.matches(cp, cost, known) { + case matchesRequiresHigherCost: + if cost == costMax { + log.Panicf("did not decide. Repo %s, doc %d, known %v", + md.Name, nextDoc, known) + } + case matchesFound: + case matchesNone: continue nextFileMatch } - - if cost == costMax && !ok { - log.Panicf("did not decide. Repo %s, doc %d, known %v", - md.Name, nextDoc, known) - } } fileMatch := FileMatch{ diff --git a/matchiter.go b/matchiter.go index 9a8fdc8b..e7d3f39a 100644 --- a/matchiter.go +++ b/matchiter.go @@ -92,8 +92,8 @@ func (t *noMatchTree) nextDoc() uint32 { func (t *noMatchTree) prepare(uint32) {} -func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return false, true +func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return matchesNone } func (t *noMatchTree) updateStats(s *Stats) { diff --git a/matchtree.go b/matchtree.go index 2327bb79..906b2b86 100644 --- a/matchtree.go +++ b/matchtree.go @@ -51,6 +51,25 @@ const ( costMax = costRegexp ) +type matchesState uint8 + +const ( + matchesRequiresHigherCost matchesState = iota + matchesFound + matchesNone +) + +func matchesStatePred(b bool) matchesState { + if b { + return matchesFound + } + return matchesNone +} + +func matchesStateForSlice[T any](v []T) matchesState { + return matchesStatePred(len(v) > 0) +} + // An expression tree coupled with matches. The matchtree has two // functions: // @@ -81,7 +100,7 @@ type matchTree interface { docIterator // returns whether this matches, and if we are sure. - matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) + matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState } // docMatchTree iterates over documents for which predicate(docID) returns true. @@ -200,13 +219,13 @@ func (t *symbolRegexpMatchTree) prepare(doc uint32) { t.matchTree.prepare(doc) } -func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.reEvaluated { - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } if cost < costRegexp { - return false, false + return matchesRequiresHigherCost } sections := cp.docSections() @@ -235,7 +254,7 @@ func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map t.found = found t.reEvaluated = true - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } type symbolSubstrMatchTree struct { @@ -564,23 +583,25 @@ func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { // all matches() methods. -func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return t.predicate(cp.idx), true +func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return matchesStatePred(t.predicate(cp.idx)) } -func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return true, true +func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return matchesFound } // andLineMatchTree is a performance optimization of andMatchTree. For content // searches we don't want to run the regex engine if there is no line that // contains matches from all terms. -func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - matches, sure := t.andMatchTree.matches(cp, cost, known) - if !(sure && matches) { - return matches, sure +func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + if state := t.andMatchTree.matches(cp, cost, known); state != matchesFound { + return state } + // Invariant: all children have matches. If any line contains all of them we + // can return MatchesFound. + // find child with fewest candidates min := maxUInt32 fewestChildren := 0 @@ -589,7 +610,7 @@ func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matc // make sure we are running a content search and that all candidates are a // substrMatchTree if !ok || v.fileName { - return matches, sure + return matchesFound } if len(v.current) < min { min = len(v.current) @@ -648,56 +669,63 @@ nextLine: } // return early once we found any line that contains matches from all children if hits == len(t.children) { - return matches, true + return matchesFound } } - return false, true + return matchesNone } -func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { sure := true for _, ch := range t.children { - v, ok := evalMatchTree(cp, cost, known, ch) - if ok && !v { - return false, true - } - if !ok { + switch evalMatchTree(cp, cost, known, ch) { + case matchesRequiresHigherCost: + // keep evaluating other children incase we come across matchesNone sure = false + case matchesFound: + // will return this if every child has this value + case matchesNone: + return matchesNone } } - return true, sure + if sure { + return matchesFound + } + return matchesRequiresHigherCost } -func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - matches := false - sure := true +func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + // we could short-circuit, but we want to use the other possibilities as a + // ranking signal. So we always return the most conservative state. + state := matchesNone for _, ch := range t.children { - v, ok := evalMatchTree(cp, cost, known, ch) - if ok { - // we could short-circuit, but we want to use - // the other possibilities as a ranking - // signal. - matches = matches || v - } else { - sure = false + switch evalMatchTree(cp, cost, known, ch) { + case matchesRequiresHigherCost: + state = matchesRequiresHigherCost + case matchesFound: + if state != matchesRequiresHigherCost { + state = matchesFound + } + case matchesNone: + // noop } } - return matches, sure + return state } -func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - return t.branchMask() != 0, true +func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return matchesStatePred(t.branchMask() != 0) } -func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.reEvaluated { - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } if cost < costRegexp { - return false, false + return matchesRequiresHigherCost } cp.stats.RegexpsConsidered++ @@ -715,16 +743,16 @@ func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[match t.found = found t.reEvaluated = true - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } -func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.evaluated { - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } if cost < costRegexp { - return false, false + return matchesRequiresHigherCost } data := cp.data(t.fileName) @@ -754,7 +782,7 @@ func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTr t.found = found t.evaluated = true - return len(t.found) > 0, true + return matchesStateForSlice(t.found) } // breakMatchesOnNewlines returns matches resulting from breaking each element @@ -792,43 +820,51 @@ func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { return cms } -func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { +func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { if v, ok := known[mt]; ok { - return v, true + return matchesStatePred(v) } - v, ok := mt.matches(cp, cost, known) - if ok { - known[mt] = v + ms := mt.matches(cp, cost, known) + if ms != matchesRequiresHigherCost { + known[mt] = ms == matchesFound } - return v, ok + return ms } -func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { - v, ok := evalMatchTree(cp, cost, known, t.child) - return !v, ok +func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + switch evalMatchTree(cp, cost, known, t.child) { + case matchesRequiresHigherCost: + return matchesRequiresHigherCost + case matchesFound: + return matchesNone + case matchesNone: + return matchesFound + default: + panic("unreachable") + } } -func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { return evalMatchTree(cp, cost, known, t.child) } -func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { +func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.contEvaluated { - return len(t.current) > 0, true + return matchesStateForSlice(t.current) } if len(t.current) == 0 { - return false, true + return matchesNone } if t.fileName && cost < costMemory { - return false, false + return matchesRequiresHigherCost } if !t.fileName && cost < costContent { - return false, false + return matchesRequiresHigherCost } pruned := t.current[:0] @@ -843,7 +879,7 @@ func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[match t.current = pruned t.contEvaluated = true - return len(t.current) > 0, true + return matchesStateForSlice(t.current) } type matchTreeOpt struct {