Search: expect trailing newlines in chunk matches (#61247)

This: 1) bumps Zoekt to include sourcegraph/zoekt#747; 2) updates all the consumers of our APIs to trim the trailing newline before splitting; and 3) updates searcher to also include trailing newlines in chunk matches.
sourcegraph · Apr 23, 2024 · 8edfc0f · 8edfc0f
1 parent 8cdba45
commit 8edfc0f
Show file tree

Hide file tree

Showing 12 changed files with 72 additions and 46 deletions.
diff --git a/client/branded/src/search-ui/components/FileContentSearchResult.tsx b/client/branded/src/search-ui/components/FileContentSearchResult.tsx
@@ -299,7 +299,7 @@ function chunkToMatchGroup(chunk: ChunkMatch): MatchGroup {
         endLine: range.end.line,
         endCharacter: range.end.column,
     }))
-    const plaintextLines = chunk.content.split(/\r?\n/)
+    const plaintextLines = chunk.content.replace(/\r?\n$/, '').split(/\r?\n/)
     return {
         plaintextLines,
         highlightedHTMLRows: undefined, // populated lazily

diff --git a/client/web-sveltekit/src/lib/search/utils.ts b/client/web-sveltekit/src/lib/search/utils.ts
@@ -19,7 +19,7 @@ export function chunkToMatchGroup(chunk: ChunkMatch): MatchGroup {
         endLine: range.end.line,
         endCharacter: range.end.column,
     }))
-    const plaintextLines = chunk.content.split(/\r?\n/)
+    const plaintextLines = chunk.content.replace(/\r?\n$/, '').split(/\r?\n/)
     return {
         plaintextLines,
         highlightedHTMLRows: undefined, // populated lazily

diff --git a/cmd/searcher/internal/search/chunk.go b/cmd/searcher/internal/search/chunk.go
@@ -81,7 +81,13 @@ func chunksToMatches(buf []byte, chunks []rangeChunk, contextLines int32) []prot
 func extendRangeToLines(inputRange protocol.Range, buf []byte) protocol.Range {
 	firstLineStart := lineStart(buf, inputRange.Start.Offset)
 	lastLineStart := lineStart(buf, inputRange.End.Offset)
-	lastLineEnd := lineEnd(buf, inputRange.End.Offset)
+	lastLineEnd := lineEnd(buf,
+		// We want the end of the line containing the last byte of the
+		// match, not the first byte after the match. In the case of a
+		// zero-width match between lines, prefer the line after rather
+		// than the line before (like we do for lineStart).
+		max(inputRange.End.Offset, max(inputRange.End.Offset, 1)-1 /* prevent underflow */),
+	)
 
 	return protocol.Range{
 		Start: protocol.Location{
@@ -113,12 +119,8 @@ func addContextLines(inputRange protocol.Range, buf []byte, contextLines int32)
 			precedingLinesAdded += 1
 		}
 
-		rest := buf[lastLineEnd:]
-		if bytes.HasPrefix(rest, []byte("\n")) && len(rest) > 1 {
-			lastLineEnd = lineEnd(buf, lastLineEnd+1)
-			succeedingLinesAdded += 1
-		} else if bytes.HasPrefix(rest, []byte("\r\n")) && len(rest) > 2 {
-			lastLineEnd = lineEnd(buf, lastLineEnd+2)
+		if int(lastLineEnd) < len(buf) {
+			lastLineEnd = lineEnd(buf, lastLineEnd)
 			succeedingLinesAdded += 1
 		}
 	}
@@ -150,10 +152,7 @@ func lineStart(buf []byte, offset int32) int32 {
 func lineEnd(buf []byte, offset int32) int32 {
 	end := int32(len(buf))
 	if loc := bytes.IndexByte(buf[offset:], '\n'); loc >= 0 {
-		end = int32(loc) + offset
-		if bytes.HasSuffix(buf[:end], []byte("\r")) {
-			end -= 1
-		}
+		end = int32(loc) + offset + 1
 	}
 	return end
 }

diff --git a/cmd/searcher/internal/search/chunk_test.go b/cmd/searcher/internal/search/chunk_test.go
@@ -172,61 +172,61 @@ func Test_addContext(t *testing.T) {
 			"\n",
 			0,
 			r(l(0, 0, 0), l(0, 0, 0)),
-			"",
+			"\n",
 		},
 		{
 			"\n",
 			1,
 			r(l(0, 0, 0), l(0, 0, 0)),
-			"",
+			"\n",
 		},
 		{
 			"\n\n\n",
 			0,
 			r(l(1, 1, 0), l(1, 1, 0)),
-			"",
+			"\n",
 		},
 		{
 			"\n\n\n\n",
 			1,
 			r(l(1, 1, 0), l(1, 1, 0)),
-			"\n\n",
+			"\n\n\n",
 		},
 		{
 			"\n\n\n\n",
 			2,
 			r(l(1, 1, 0), l(1, 1, 0)),
-			"\n\n\n",
+			"\n\n\n\n",
 		},
 		{
 			"abc\ndef\nghi\n",
 			0,
 			r(l(1, 0, 1), l(1, 0, 1)),
-			"abc",
+			"abc\n",
 		},
 		{
 			"abc\ndef\nghi\n",
 			1,
 			r(l(1, 0, 1), l(1, 0, 1)),
-			"abc\ndef",
+			"abc\ndef\n",
 		},
 		{
 			"abc\ndef\nghi\n",
 			2,
 			r(l(1, 0, 1), l(1, 0, 1)),
-			"abc\ndef\nghi",
+			"abc\ndef\nghi\n",
 		},
 		{
 			"abc\ndef\nghi",
 			0,
 			r(l(1, 0, 1), l(1, 0, 1)),
-			"abc",
+			"abc\n",
 		},
 		{
 			"abc\ndef\nghi",
 			1,
 			r(l(1, 0, 1), l(1, 0, 1)),
-			"abc\ndef",
+			"abc\ndef\n",
 		},
 		{
 			"abc\ndef\nghi",
@@ -256,7 +256,7 @@ func Test_addContext(t *testing.T) {
 			"abc\r\ndef\r\nghi\r\n",
 			1,
 			r(l(1, 0, 1), l(2, 0, 2)),
-			"abc\r\ndef",
+			"abc\r\ndef\r\n",
 		},
 		{
 			"abc\r\ndef\r\nghi",
@@ -268,19 +268,19 @@ func Test_addContext(t *testing.T) {
 			"\r\n",
 			0,
 			r(l(0, 0, 0), l(0, 0, 0)),
-			"",
+			"\r\n",
 		},
 		{
 			"\r\n",
 			1,
 			r(l(0, 0, 0), l(0, 0, 0)),
-			"",
+			"\r\n",
 		},
 		{
 			"abc\nd\xE2\x9D\x89f\nghi",
 			0,
 			r(l(4, 1, 0), l(5, 1, 1)),
-			"d\xE2\x9D\x89f",
+			"d\xE2\x9D\x89f\n",
 		},
 		{
 			"abc\nd\xE2\x9D\x89f\nghi",

diff --git a/cmd/searcher/internal/search/hybrid_test.go b/cmd/searcher/internal/search/hybrid_test.go
@@ -155,12 +155,14 @@ Hello world example in go`, typeFile},
 		Want: `
 added.md:1:1:
 hello world I am added
+// No newline at end of chunk
 changed.go:6:6:
 	fmt.Println("Hello world")
 unchanged.md:1:1:
 # Hello World
 unchanged.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `,
 	}, {
 		Name: "added",
@@ -171,6 +173,7 @@ Hello world example in go
 		Want: `
 added.md:1:1:
 hello world I am added
+// No newline at end of chunk
 `,
 	}, {
 		Name: "example",
@@ -180,6 +183,7 @@ hello world I am added
 		Want: `
 unchanged.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `,
 	}, {
 		Name: "boolean query",
@@ -199,6 +203,7 @@ Hello world example in go
 		Want: `
 added.md:1:1:
 hello world I am added
+// No newline at end of chunk
 changed.go:1:1:
 package main
 changed.go:6:6:
@@ -207,6 +212,7 @@ unchanged.md:1:1:
 # Hello World
 unchanged.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `,
 	}, {
 		Name: "negated-pattern-example",
@@ -268,6 +274,7 @@ unchanged.md
 changed.go
 unchanged.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `,
 	}, {
 		Name: "negated-pattern-path",

diff --git a/cmd/searcher/internal/search/search_structural_test.go b/cmd/searcher/internal/search/search_structural_test.go
@@ -500,14 +500,14 @@ func bar() {
 		expected := []protocol.FileMatch{{
 			Path: "main.go",
 			ChunkMatches: []protocol.ChunkMatch{{
-				Content:      "func foo() {\n    fmt.Println(\"foo\")\n}",
+				Content:      "func foo() {\n    fmt.Println(\"foo\")\n}\n",
 				ContentStart: protocol.Location{Offset: 1, Line: 1},
 				Ranges: []protocol.Range{{
 					Start: protocol.Location{Offset: 12, Line: 1, Column: 11},
 					End:   protocol.Location{Offset: 38, Line: 3, Column: 1},
 				}},
 			}, {
-				Content:      "func bar() {\n    fmt.Println(\"bar\")\n}",
+				Content:      "func bar() {\n    fmt.Println(\"bar\")\n}\n",
 				ContentStart: protocol.Location{Offset: 40, Line: 5},
 				Ranges: []protocol.Range{{
 					Start: protocol.Location{Offset: 51, Line: 5, Column: 11},

diff --git a/cmd/searcher/internal/search/search_test.go b/cmd/searcher/internal/search/search_test.go
@@ -86,6 +86,7 @@ func main() {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "world"}, IsCaseSensitive: true},
 		want: autogold.Expect(`README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 main.go:6:6:
 fmt.Println("Hello world")
 `),
@@ -95,6 +96,7 @@ fmt.Println("Hello world")
 		want: autogold.Expect(`README.md:2:3:
 
 Hello world example in go
+// No newline at end of chunk
 main.go:5:7:
 func main() {
 fmt.Println("Hello world")
@@ -107,6 +109,7 @@ fmt.Println("Hello world")
 # Hello World
 
 Hello world example in go
+// No newline at end of chunk
 main.go:4:7:
 
 func main() {
@@ -120,6 +123,7 @@ fmt.Println("Hello world")
 # Hello World
 
 Hello world example in go
+// No newline at end of chunk
 main.go:1:7:
 package main
 
@@ -135,6 +139,7 @@ fmt.Println("Hello world")
 # Hello World
 README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 main.go:6:6:
 fmt.Println("Hello world")
 `),
@@ -169,6 +174,7 @@ fmt.Println("Hello world")
 # Hello World
 README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: ""}, ExcludeLangs: []string{"Markdown"}},
@@ -185,10 +191,14 @@ symlink
 # Hello World
 README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `),
 	}, {
-		arg:  protocol.PatternInfo{Query: &protocol.PatternNode{Value: "w"}, IncludePaths: []string{`\.(md|txt)$`, `\.txt$`}},
-		want: autogold.Expect("abc.txt:1:1:\nw\n"),
+		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "w"}, IncludePaths: []string{`\.(md|txt)$`, `\.txt$`}},
+		want: autogold.Expect(`abc.txt:1:1:
+w
+// No newline at end of chunk
+`),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "world"}, ExcludePaths: "README\\.md"},
 		want: autogold.Expect(`main.go:6:6:
@@ -200,13 +210,15 @@ fmt.Println("Hello world")
 # Hello World
 README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "w"}, IncludePaths: []string{"\\.(md|txt)", "README"}},
 		want: autogold.Expect(`README.md:1:1:
 # Hello World
 README.md:3:3:
 Hello world example in go
+// No newline at end of chunk
 `),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "world"}, IncludePaths: []string{`\.(MD|go)$`}, PathPatternsAreCaseSensitive: true},
@@ -290,15 +302,15 @@ func main() {
 # Hello World
 
 Hello world example in go
-main.go:1:8:
+// No newline at end of chunk
+main.go:1:7:
 package main
 
 import "fmt"
 
 func main() {
 fmt.Println("Hello world")
 }
-
 `),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "^$", IsRegExp: true}},
@@ -310,8 +322,10 @@ main.go:4:4:
 
 main.go:8:8:
 
+// No newline at end of chunk
 milton.png:1:1:
 
+// No newline at end of chunk
 `),
 	}, {
 		arg: protocol.PatternInfo{
@@ -324,6 +338,7 @@ milton.png:1:1:
 		},
 		want: autogold.Expect(`file++.plus:1:1:
 filename contains regex metachars
+// No newline at end of chunk
 `),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "World", IsNegated: true}},
@@ -360,17 +375,18 @@ symlink
 `),
 	}, {
 		arg:  protocol.PatternInfo{Query: &protocol.PatternNode{Value: "abc"}, PatternMatchesPath: true, PatternMatchesContent: true},
-		want: autogold.Expect("abc.txt\nsymlink:1:1:\nabc.txt\n"),
+		want: autogold.Expect("abc.txt\nsymlink:1:1:\nabc.txt\n// No newline at end of chunk\n"),
 	}, {
 		arg:  protocol.PatternInfo{Query: &protocol.PatternNode{Value: "abc"}, PatternMatchesPath: false, PatternMatchesContent: true},
-		want: autogold.Expect("symlink:1:1:\nabc.txt\n"),
+		want: autogold.Expect("symlink:1:1:\nabc.txt\n// No newline at end of chunk\n"),
 	}, {
 		arg:  protocol.PatternInfo{Query: &protocol.PatternNode{Value: "abc"}, PatternMatchesPath: true, PatternMatchesContent: false},
 		want: autogold.Expect("abc.txt\n"),
 	}, {
 		arg: protocol.PatternInfo{Query: &protocol.PatternNode{Value: "utf8"}, PatternMatchesPath: false, PatternMatchesContent: true},
 		want: autogold.Expect(`nonutf8.txt:1:1:
 file contains invalid utf8 � characters
+// No newline at end of chunk
 `),
 	}}
 
@@ -740,13 +756,17 @@ func toString(m []protocol.FileMatch) string {
 		for _, cm := range f.ChunkMatches {
 			buf.WriteString(f.Path)
 			buf.WriteByte(':')
-			buf.WriteString(strconv.Itoa(int(cm.ContentStart.Line) + 1))
+			firstLine := int(cm.ContentStart.Line) + 1
+			lastLine := firstLine + strings.Count(strings.TrimSuffix(cm.Content, "\n"), "\n")
+			buf.WriteString(strconv.Itoa(firstLine))
 			buf.WriteByte(':')
-			buf.WriteString(strconv.Itoa(int(cm.ContentStart.Line) + strings.Count(cm.Content, "\n") + 1))
+			buf.WriteString(strconv.Itoa(lastLine))
 			buf.WriteByte(':')
 			buf.WriteByte('\n')
 			buf.WriteString(cm.Content)
-			buf.WriteByte('\n')
+			if !strings.HasSuffix(cm.Content, "\n") {
+				buf.WriteString("\n// No newline at end of chunk\n")
+			}
 		}
 	}
 	return buf.String()