From 8801747561381ac59c6e5fc246b6991898692f49 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith <keegan.csmith@gmail.com>
Date: Thu, 16 Nov 2023 16:00:32 +0200
Subject: [PATCH] matchtree: switch to enum for matches signature (#691)

The pair of bools (matches, sure) often was quite hard to reason about.
I think this came down to them often not being named at return sites,
the names itself being too concise and that two booleans represents 4
possible states, but we only had two possible states.

This commit uses a more verbose enum instead of booleans. From reading
the diff back I find the code easier to reason about, so I think this is
a good change.

Test Plan: go test ./...
---
 eval.go      |  17 +++--
 matchiter.go |   4 +-
 matchtree.go | 191 +++++++++++++++++++++++++++++++++------------------
 3 files changed, 135 insertions(+), 77 deletions(-)

diff --git a/eval.go b/eval.go
index 720ea5cf..c2b802c6 100644
--- a/eval.go
+++ b/eval.go
@@ -291,15 +291,18 @@ nextFileMatch:
 		md := d.repoMetaData[d.repos[nextDoc]]
 
 		for cost := costMin; cost <= costMax; cost++ {
-			v, ok := mt.matches(cp, cost, known)
-			if ok && !v {
+			switch mt.matches(cp, cost, known) {
+			case matchesRequiresHigherCost:
+				if cost == costMax {
+					log.Panicf("did not decide. Repo %s, doc %d, known %v",
+						md.Name, nextDoc, known)
+				}
+			case matchesFound:
+				// could short-circuit now, but we want to run higher costs to
+				// potentially find higher ranked matches.
+			case matchesNone:
 				continue nextFileMatch
 			}
-
-			if cost == costMax && !ok {
-				log.Panicf("did not decide. Repo %s, doc %d, known %v",
-					md.Name, nextDoc, known)
-			}
 		}
 
 		fileMatch := FileMatch{
diff --git a/matchiter.go b/matchiter.go
index 9a8fdc8b..e7d3f39a 100644
--- a/matchiter.go
+++ b/matchiter.go
@@ -92,8 +92,8 @@ func (t *noMatchTree) nextDoc() uint32 {
 
 func (t *noMatchTree) prepare(uint32) {}
 
-func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	return false, true
+func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	return matchesNone
 }
 
 func (t *noMatchTree) updateStats(s *Stats) {
diff --git a/matchtree.go b/matchtree.go
index 2327bb79..eaa785ea 100644
--- a/matchtree.go
+++ b/matchtree.go
@@ -29,8 +29,10 @@ import (
 
 // A docIterator iterates over documents in order.
 type docIterator interface {
-	// provide the next document where we can may find something
-	// interesting.
+	// provide the next document where we may find something interesting.
+	//
+	// This is like a "peek" and shouldn't mutate state. prepare is what should
+	// change state.
 	nextDoc() uint32
 
 	// clears any per-document state of the docIterator, and
@@ -39,6 +41,8 @@ type docIterator interface {
 	prepare(nextDoc uint32)
 }
 
+// costs are passed in increasing order to matchTree.matches until they do not
+// return matchesRequiresHigherCost.
 const (
 	costConst   = 0
 	costMemory  = 1
@@ -51,6 +55,39 @@ const (
 	costMax = costRegexp
 )
 
+// matchesState is an enum for the state of a matchTree after a call to
+// matchTree.matches.
+type matchesState uint8
+
+const (
+	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
+	// a search yet since the cost value is not high enough.
+	matchesRequiresHigherCost matchesState = iota
+
+	// matchesFound is returned when matchTree.matches has done a search and
+	// found one or more matches.
+	matchesFound
+
+	// matchesNone is returned when matchTree.matches has done a search and
+	// found nothing.
+	matchesNone
+)
+
+// matchesStatePred is a helper which returns matchesFound if b is true
+// otherwise returns matchesNone.
+func matchesStatePred(b bool) matchesState {
+	if b {
+		return matchesFound
+	}
+	return matchesNone
+}
+
+// matchesStateForSlice is a helper which returns matchesFound if v is
+// non-empty otherwise returns matchesNone.
+func matchesStateForSlice[T any](v []T) matchesState {
+	return matchesStatePred(len(v) > 0)
+}
+
 // An expression tree coupled with matches. The matchtree has two
 // functions:
 //
@@ -75,13 +112,15 @@ const (
 //
 //   - evaluate the tree using matches(), storing the result in map.
 //
-//   - if the complete tree returns (matches() == true) for the document,
-//     collect all text matches by looking at leaf matchTrees
+//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
+//     for the document, collect all text matches by looking at leaf
+//     matchTrees.
 type matchTree interface {
 	docIterator
 
-	// returns whether this matches, and if we are sure.
-	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
+	// matches if cost is high enough, caching known values for future
+	// evaluation at higher costs. See documentation for matchesState's values.
+	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 }
 
 // docMatchTree iterates over documents for which predicate(docID) returns true.
@@ -200,13 +239,13 @@ func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 	t.matchTree.prepare(doc)
 }
 
-func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	if t.reEvaluated {
-		return len(t.found) > 0, true
+		return matchesStateForSlice(t.found)
 	}
 
 	if cost < costRegexp {
-		return false, false
+		return matchesRequiresHigherCost
 	}
 
 	sections := cp.docSections()
@@ -235,7 +274,7 @@ func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map
 	t.found = found
 	t.reEvaluated = true
 
-	return len(t.found) > 0, true
+	return matchesStateForSlice(t.found)
 }
 
 type symbolSubstrMatchTree struct {
@@ -564,23 +603,25 @@ func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 
 // all matches() methods.
 
-func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	return t.predicate(cp.idx), true
+func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	return matchesStatePred(t.predicate(cp.idx))
 }
 
-func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	return true, true
+func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	return matchesFound
 }
 
 // andLineMatchTree is a performance optimization of andMatchTree. For content
 // searches we don't want to run the regex engine if there is no line that
 // contains matches from all terms.
-func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	matches, sure := t.andMatchTree.matches(cp, cost, known)
-	if !(sure && matches) {
-		return matches, sure
+func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	if state := t.andMatchTree.matches(cp, cost, known); state != matchesFound {
+		return state
 	}
 
+	// Invariant: all children have matches. If any line contains all of them we
+	// can return MatchesFound.
+
 	// find child with fewest candidates
 	min := maxUInt32
 	fewestChildren := 0
@@ -589,7 +630,7 @@ func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matc
 		// make sure we are running a content search and that all candidates are a
 		// substrMatchTree
 		if !ok || v.fileName {
-			return matches, sure
+			return matchesFound
 		}
 		if len(v.current) < min {
 			min = len(v.current)
@@ -648,56 +689,62 @@ nextLine:
 		}
 		// return early once we found any line that contains matches from all children
 		if hits == len(t.children) {
-			return matches, true
+			return matchesFound
 		}
 	}
-	return false, true
+	return matchesNone
 }
 
-func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	sure := true
+func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	// We have found matches unless a child needs to do more work or it hasn't
+	// found matches.
+	state := matchesFound
 
 	for _, ch := range t.children {
-		v, ok := evalMatchTree(cp, cost, known, ch)
-		if ok && !v {
-			return false, true
-		}
-		if !ok {
-			sure = false
+		switch evalMatchTree(cp, cost, known, ch) {
+		case matchesRequiresHigherCost:
+			// keep evaluating other children incase we come across matchesNone
+			state = matchesRequiresHigherCost
+		case matchesFound:
+			// will return this if every child has this value
+		case matchesNone:
+			return matchesNone
 		}
 	}
 
-	return true, sure
+	return state
 }
 
-func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	matches := false
-	sure := true
+func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	// we could short-circuit, but we want to use the other possibilities as a
+	// ranking signal. So we always return the most conservative state.
+	state := matchesNone
 	for _, ch := range t.children {
-		v, ok := evalMatchTree(cp, cost, known, ch)
-		if ok {
-			// we could short-circuit, but we want to use
-			// the other possibilities as a ranking
-			// signal.
-			matches = matches || v
-		} else {
-			sure = false
+		switch evalMatchTree(cp, cost, known, ch) {
+		case matchesRequiresHigherCost:
+			state = matchesRequiresHigherCost
+		case matchesFound:
+			if state != matchesRequiresHigherCost {
+				state = matchesFound
+			}
+		case matchesNone:
+			// noop
 		}
 	}
-	return matches, sure
+	return state
 }
 
-func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	return t.branchMask() != 0, true
+func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	return matchesStatePred(t.branchMask() != 0)
 }
 
-func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	if t.reEvaluated {
-		return len(t.found) > 0, true
+		return matchesStateForSlice(t.found)
 	}
 
 	if cost < costRegexp {
-		return false, false
+		return matchesRequiresHigherCost
 	}
 
 	cp.stats.RegexpsConsidered++
@@ -715,16 +762,16 @@ func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[match
 	t.found = found
 	t.reEvaluated = true
 
-	return len(t.found) > 0, true
+	return matchesStateForSlice(t.found)
 }
 
-func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	if t.evaluated {
-		return len(t.found) > 0, true
+		return matchesStateForSlice(t.found)
 	}
 
 	if cost < costRegexp {
-		return false, false
+		return matchesRequiresHigherCost
 	}
 
 	data := cp.data(t.fileName)
@@ -754,7 +801,7 @@ func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTr
 	t.found = found
 	t.evaluated = true
 
-	return len(t.found) > 0, true
+	return matchesStateForSlice(t.found)
 }
 
 // breakMatchesOnNewlines returns matches resulting from breaking each element
@@ -792,43 +839,51 @@ func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 	return cms
 }
 
-func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
+func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 	if v, ok := known[mt]; ok {
-		return v, true
+		return matchesStatePred(v)
 	}
 
-	v, ok := mt.matches(cp, cost, known)
-	if ok {
-		known[mt] = v
+	ms := mt.matches(cp, cost, known)
+	if ms != matchesRequiresHigherCost {
+		known[mt] = ms == matchesFound
 	}
 
-	return v, ok
+	return ms
 }
 
-func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
-	v, ok := evalMatchTree(cp, cost, known, t.child)
-	return !v, ok
+func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	switch evalMatchTree(cp, cost, known, t.child) {
+	case matchesRequiresHigherCost:
+		return matchesRequiresHigherCost
+	case matchesFound:
+		return matchesNone
+	case matchesNone:
+		return matchesFound
+	default:
+		panic("unreachable")
+	}
 }
 
-func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	return evalMatchTree(cp, cost, known, t.child)
 }
 
-func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	if t.contEvaluated {
-		return len(t.current) > 0, true
+		return matchesStateForSlice(t.current)
 	}
 
 	if len(t.current) == 0 {
-		return false, true
+		return matchesNone
 	}
 
 	if t.fileName && cost < costMemory {
-		return false, false
+		return matchesRequiresHigherCost
 	}
 
 	if !t.fileName && cost < costContent {
-		return false, false
+		return matchesRequiresHigherCost
 	}
 
 	pruned := t.current[:0]
@@ -843,7 +898,7 @@ func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[match
 	t.current = pruned
 	t.contEvaluated = true
 
-	return len(t.current) > 0, true
+	return matchesStateForSlice(t.current)
 }
 
 type matchTreeOpt struct {