From 1f53faedfd7bf385dfa2721c485e3af6509b229e Mon Sep 17 00:00:00 2001
From: Jackson Owens <jackson@cockroachlabs.com>
Date: Fri, 27 Sep 2024 11:33:02 -0400
Subject: [PATCH] sstable: update TestWriter to exercise TableFormatPebblev5

Update the TestWriter test to exercise the new TableFormatPebblev5 that enables
columnar blocks. This commit also fixes a couple issues with key ordering checks.
---
 internal/base/internal.go    |   5 +
 sstable/colblk/data_block.go |   3 -
 sstable/colblk/keyspan.go    |  15 ++
 sstable/colblk_writer.go     |  29 +++-
 sstable/random_test.go       |   5 +-
 sstable/testdata/writer_v5   | 282 +++++++++++++++++++++++++++++++++++
 sstable/writer_test.go       |  15 +-
 7 files changed, 340 insertions(+), 14 deletions(-)
 create mode 100644 sstable/testdata/writer_v5

diff --git a/internal/base/internal.go b/internal/base/internal.go
index 0fc719f4b5..0c123eb4ce 100644
--- a/internal/base/internal.go
+++ b/internal/base/internal.go
@@ -230,6 +230,11 @@ func MakeTrailer(seqNum SeqNum, kind InternalKeyKind) InternalKeyTrailer {
 	return (InternalKeyTrailer(seqNum) << 8) | InternalKeyTrailer(kind)
 }
 
+// String imlements the fmt.Stringer interface.
+func (t InternalKeyTrailer) String() string {
+	return fmt.Sprintf("%s,%s", SeqNum(t>>8), InternalKeyKind(t&0xff))
+}
+
 // SeqNum returns the sequence number component of the trailer.
 func (t InternalKeyTrailer) SeqNum() SeqNum {
 	return SeqNum(t >> 8)
diff --git a/sstable/colblk/data_block.go b/sstable/colblk/data_block.go
index 985d769cf6..29a70ebeef 100644
--- a/sstable/colblk/data_block.go
+++ b/sstable/colblk/data_block.go
@@ -183,9 +183,6 @@ func (w *defaultKeyWriter) ComparePrev(key []byte) KeyComparison {
 	lp := w.prefixes.UnsafeGet(w.prefixes.nKeys - 1)
 	cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen]))
 
-	if invariants.Enabled && bytes.Compare(lp, key[:cmpv.PrefixLen]) > 0 {
-		panic(errors.AssertionFailedf("keys are not in order: %q > %q", lp, key[:cmpv.PrefixLen]))
-	}
 	// Keys are written in order and prefixes must be sorted lexicograpgically,
 	// so CommonPrefixLen == PrefixLen implies that the keys share the same
 	// logical prefix. (If the previous key had a prefix longer than
diff --git a/sstable/colblk/keyspan.go b/sstable/colblk/keyspan.go
index dac1cad183..d8e50f0de1 100644
--- a/sstable/colblk/keyspan.go
+++ b/sstable/colblk/keyspan.go
@@ -132,6 +132,21 @@ func (w *KeyspanBlockWriter) UnsafeBoundaryKeys() (smallest, largest base.Intern
 	return smallest, largest
 }
 
+// UnsafeLastSpan returns the start and end user keys of the last span written
+// to the block and the trailer of its largest key. The returned keys point
+// directly into the block writer's memory and must not be mutated.
+func (w *KeyspanBlockWriter) UnsafeLastSpan() (
+	start, end []byte,
+	largestTrailer base.InternalKeyTrailer,
+) {
+	if w.keyCount == 0 {
+		return nil, nil, 0
+	}
+	return w.boundaryUserKeys.UnsafeGet(w.boundaryUserKeys.rows - 2),
+		w.boundaryUserKeys.UnsafeGet(w.boundaryUserKeys.rows - 1),
+		base.InternalKeyTrailer(w.trailers.Get(w.keyCount - 1))
+}
+
 // Size returns the size of the pending block.
 func (w *KeyspanBlockWriter) Size() int {
 	off := blockHeaderSize(keyspanColumnCount, keyspanHeaderSize)
diff --git a/sstable/colblk_writer.go b/sstable/colblk_writer.go
index f4493d8a48..f9f517a8b7 100644
--- a/sstable/colblk_writer.go
+++ b/sstable/colblk_writer.go
@@ -254,11 +254,31 @@ func (w *RawColumnWriter) EncodeSpan(span keyspan.Span) error {
 	for _, k := range span.Keys {
 		w.meta.updateSeqNum(k.SeqNum())
 	}
+
+	blockWriter := &w.rangeKeyBlock
 	if span.Keys[0].Kind() == base.InternalKeyKindRangeDelete {
-		w.rangeDelBlock.AddSpan(span)
-		return nil
+		blockWriter = &w.rangeDelBlock
+	}
+	if !w.disableKeyOrderChecks && blockWriter.KeyCount() > 0 {
+		// Check that spans are being added in fragmented order. If the two
+		// tombstones overlap, their start and end keys must be identical.
+		prevStart, prevEnd, prevTrailer := blockWriter.UnsafeLastSpan()
+		if w.opts.Comparer.Equal(prevStart, span.Start) && w.opts.Comparer.Equal(prevEnd, span.End) {
+			if prevTrailer < span.Keys[0].Trailer {
+				w.err = errors.Errorf("pebble: keys must be added in order: %s-%s:{(#%s)}, %s",
+					w.opts.Comparer.FormatKey(prevStart),
+					w.opts.Comparer.FormatKey(prevEnd),
+					prevTrailer, span.Pretty(w.opts.Comparer.FormatKey))
+			}
+		} else if c := w.opts.Comparer.Compare(prevEnd, span.Start); c > 0 {
+			w.err = errors.Errorf("pebble: keys must be added in order: %s-%s:{(#%s)}, %s",
+				w.opts.Comparer.FormatKey(prevStart),
+				w.opts.Comparer.FormatKey(prevEnd),
+				prevTrailer, span.Pretty(w.opts.Comparer.FormatKey))
+			return w.err
+		}
 	}
-	w.rangeKeyBlock.AddSpan(span)
+	blockWriter.AddSpan(span)
 	return nil
 }
 
@@ -759,6 +779,9 @@ func (w *RawColumnWriter) Close() (err error) {
 			w.err = err
 		}
 	}()
+	if w.layout.writable == nil {
+		return w.err
+	}
 
 	// Finish the last data block and send it to the write queue if it contains
 	// any pending KVs.
diff --git a/sstable/random_test.go b/sstable/random_test.go
index 12bac379b9..c3e51a4255 100644
--- a/sstable/random_test.go
+++ b/sstable/random_test.go
@@ -274,8 +274,9 @@ type randomTableConfig struct {
 
 func (cfg *randomTableConfig) readerOpts() ReaderOptions {
 	rOpts := ReaderOptions{
-		Comparer: testkeys.Comparer,
-		Filters:  map[string]FilterPolicy{},
+		Comparer:  testkeys.Comparer,
+		KeySchema: cfg.wopts.KeySchema,
+		Filters:   map[string]FilterPolicy{},
 	}
 	if cfg.wopts.FilterPolicy != nil {
 		rOpts.Filters[cfg.wopts.FilterPolicy.Name()] = cfg.wopts.FilterPolicy
diff --git a/sstable/testdata/writer_v5 b/sstable/testdata/writer_v5
new file mode 100644
index 0000000000..1fdcd5523e
--- /dev/null
+++ b/sstable/testdata/writer_v5
@@ -0,0 +1,282 @@
+build
+a.SET.1:a
+----
+point:    [a#1,SET-a#1,SET]
+seqnums:  [1-1]
+
+scan
+----
+a#1,SET:a
+
+scan-range-del
+----
+
+scan-range-key
+----
+
+build
+a.SET.1:a
+b.DEL.2:
+c.MERGE.3:c
+EncodeSpan: d-e:{(#4,RANGEDEL)}
+f.SET.5:f
+g.DEL.6:
+h.MERGE.7:h
+EncodeSpan: i-j:{(#8,RANGEDEL)}
+EncodeSpan: j-k:{(#9,RANGEKEYDEL)}
+EncodeSpan: k-l:{(#10,RANGEKEYUNSET,@5)}
+EncodeSpan: l-m:{(#11,RANGEKEYSET,@10,foo)}
+----
+point:    [a#1,SET-h#7,MERGE]
+rangedel: [d#4,RANGEDEL-j#inf,RANGEDEL]
+rangekey: [j#9,RANGEKEYDEL-m#inf,RANGEKEYSET]
+seqnums:  [1-11]
+
+build
+a.SET.1:a
+b.DEL.2:
+c.MERGE.3:c
+EncodeSpan: d-e:{(#4,RANGEDEL)}
+f.SET.5:f
+g.DEL.6:
+h.MERGE.7:h
+EncodeSpan: i-j:{(#8,RANGEDEL)}
+----
+point:    [a#1,SET-h#7,MERGE]
+rangedel: [d#4,RANGEDEL-j#inf,RANGEDEL]
+seqnums:  [1-8]
+
+scan
+----
+a#1,SET:a
+b#2,DEL:
+c#3,MERGE:c
+f#5,SET:f
+g#6,DEL:
+h#7,MERGE:h
+
+scan-range-del
+----
+d-e:{(#4,RANGEDEL)}
+i-j:{(#8,RANGEDEL)}
+
+# 3: a-----------m
+# 2:      f------------s
+# 1:          j---------------z
+
+build
+EncodeSpan: a-f:{(#3,RANGEDEL)}
+EncodeSpan: f-j:{(#3,RANGEDEL) (#2,RANGEDEL)}
+EncodeSpan: j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)}
+EncodeSpan: m-s:{(#2,RANGEDEL) (#1,RANGEDEL)}
+EncodeSpan: s-z:{(#1,RANGEDEL)}
+----
+rangedel: [a#3,RANGEDEL-z#inf,RANGEDEL]
+seqnums:  [1-3]
+
+scan
+----
+
+scan-range-del
+----
+a-f:{(#3,RANGEDEL)}
+f-j:{(#3,RANGEDEL) (#2,RANGEDEL)}
+j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)}
+m-s:{(#2,RANGEDEL) (#1,RANGEDEL)}
+s-z:{(#1,RANGEDEL)}
+
+scan-range-key
+----
+
+# The range tombstone upper bound is exclusive, so a point operation
+# on that same key will be the actual boundary.
+
+build
+EncodeSpan: a-b:{(#3,RANGEDEL)}
+b.SET.4:c
+----
+point:    [b#4,SET-b#4,SET]
+rangedel: [a#3,RANGEDEL-b#inf,RANGEDEL]
+seqnums:  [3-4]
+
+build
+EncodeSpan: a-b:{(#3,RANGEDEL)}
+b.SET.2:c
+----
+point:    [b#2,SET-b#2,SET]
+rangedel: [a#3,RANGEDEL-b#inf,RANGEDEL]
+seqnums:  [2-3]
+
+build
+EncodeSpan: a-c:{(#3,RANGEDEL)}
+b.SET.2:c
+----
+point:    [b#2,SET-b#2,SET]
+rangedel: [a#3,RANGEDEL-c#inf,RANGEDEL]
+seqnums:  [2-3]
+
+# Keys must be added in order.
+
+build
+a.SET.1:b
+a.SET.2:c
+----
+pebble: keys must be added in strictly increasing order: a#2,SET
+
+build
+b.SET.1:a
+a.SET.2:b
+----
+pebble: keys must be added in strictly increasing order: a#2,SET
+
+build
+b.RANGEDEL.1:c
+a.RANGEDEL.2:b
+----
+RANGEDEL must be added through EncodeSpan
+
+build
+EncodeSpan: b-c:{(#1,RANGEDEL)}
+EncodeSpan: a-b:{(#2,RANGEDEL)}
+----
+pebble: keys must be added in order: b-c:{(#1,RANGEDEL)}, a-b:{(#2,RANGEDEL)}
+
+build-raw
+EncodeSpan: a-c:{(#1,RANGEDEL)}
+EncodeSpan: a-c:{(#2,RANGEDEL)}
+----
+pebble: keys must be added in order: a-c:{(#1,RANGEDEL)}, a-c:{(#2,RANGEDEL)}
+
+build-raw
+EncodeSpan: a-c:{(#1,RANGEDEL)}
+EncodeSpan: b-d:{(#2,RANGEDEL)}
+----
+pebble: keys must be added in order: a-c:{(#1,RANGEDEL)}, b-d:{(#2,RANGEDEL)}
+
+build-raw
+EncodeSpan: a-c:{(#2,RANGEDEL)}
+EncodeSpan: a-d:{(#1,RANGEDEL)}
+----
+pebble: keys must be added in order: a-c:{(#2,RANGEDEL)}, a-d:{(#1,RANGEDEL)}
+
+build-raw
+EncodeSpan: a-c:{(#1,RANGEDEL)}
+EncodeSpan: c-d:{(#2,RANGEDEL)}
+----
+rangedel: [a#1,RANGEDEL-d#inf,RANGEDEL]
+seqnums:  [1-2]
+
+build-raw
+EncodeSpan: a-b:{(#2,RANGEKEYSET,@10,foo) (#1,RANGEKEYSET,@10,foo)}
+----
+rangekey: [a#2,RANGEKEYSET-b#inf,RANGEKEYSET]
+seqnums:  [1-2]
+
+build-raw
+EncodeSpan: b-c:{(#2,RANGEKEYSET,@10,foo)}
+EncodeSpan: a-b:{(#1,RANGEKEYSET,@10,foo)}
+----
+pebble: keys must be added in order: b-c:{(#2,RANGEKEYSET)}, a-b:{(#1,RANGEKEYSET,@10,foo)}
+
+build-raw
+EncodeSpan: a-c:{(#1,RANGEKEYSET,@10,foo)}
+EncodeSpan: c-d:{(#2,RANGEKEYSET,@10,foo)}
+----
+rangekey: [a#1,RANGEKEYSET-d#inf,RANGEKEYSET]
+seqnums:  [1-2]
+
+# Range keys may have perfectly aligned spans (including sequence numbers),
+# though the key kinds must be ordered (descending).
+
+build-raw
+EncodeSpan: a-b:{(#1,RANGEKEYSET,@10,foo) (#1,RANGEKEYUNSET,@10) (#1,RANGEKEYDEL)}
+----
+rangekey: [a#1,RANGEKEYSET-b#inf,RANGEKEYDEL]
+seqnums:  [1-1]
+
+# Setting a very small index-block-size results in a two-level index.
+
+build block-size=1 index-block-size=1
+a.SET.1:a
+b.SET.1:b
+c.SET.1:c
+----
+point:    [a#1,SET-c#1,SET]
+seqnums:  [1-1]
+
+layout
+----
+         0  data (74)
+        79  data (74)
+       158  data (74)
+       237  index (36)
+       278  index (44)
+       327  index (44)
+       376  top-index (53)
+       434  properties (480)
+       919  meta-index (33)
+       957  footer (53)
+      1010  EOF
+
+# Exercise the non-Reader layout-decoding codepath.
+
+decode-layout
+----
+         0  data (74)
+        79  data (74)
+       158  data (74)
+       237  index (36)
+       278  index (44)
+       327  index (44)
+       376  top-index (53)
+       434  properties (480)
+       919  meta-index (33)
+       957  footer (53)
+      1010  EOF
+
+scan
+----
+a#1,SET:a
+b#1,SET:b
+c#1,SET:c
+
+# Enabling leveldb format disables the creation of a two-level index
+# (the input data here mirrors the test case above).
+
+build leveldb block-size=1 index-block-size=1
+a.SET.1:a
+b.SET.1:b
+c.SET.1:c
+----
+point:    [a#1,SET-c#1,SET]
+seqnums:  [1-1]
+
+layout
+----
+         0  data (21)
+        26  data (21)
+        52  data (21)
+        78  index (45)
+       128  properties (549)
+       682  meta-index (33)
+       720  leveldb-footer (48)
+       768  EOF
+
+# Range keys, if present, are shown in the layout.
+
+build
+EncodeSpan: a-b:{(#3,RANGEKEYSET,@3,foo)}
+EncodeSpan: b-c:{(#2,RANGEKEYSET,@2,bar)}
+EncodeSpan: c-d:{(#1,RANGEKEYSET,@1,baz)}
+----
+rangekey: [a#3,RANGEKEYSET-d#inf,RANGEKEYSET]
+seqnums:  [1-3]
+
+layout
+----
+         0  index (28)
+        33  range-key (84)
+       122  properties (441)
+       568  meta-index (57)
+       630  footer (53)
+       683  EOF
diff --git a/sstable/writer_test.go b/sstable/writer_test.go
index b707622689..91b1556130 100644
--- a/sstable/writer_test.go
+++ b/sstable/writer_test.go
@@ -39,12 +39,15 @@ type tableFormatFile struct {
 }
 
 func testWriterParallelism(t *testing.T, parallelism bool) {
-	for _, format := range []TableFormat{TableFormatPebblev2, TableFormatPebblev3} {
-		tdFile := "testdata/writer"
-		if format == TableFormatPebblev3 {
-			tdFile = "testdata/writer_v3"
-		}
-		t.Run(format.String(), func(t *testing.T) { runDataDriven(t, tdFile, format, parallelism) })
+	formatFiles := []tableFormatFile{
+		{TableFormat: TableFormatPebblev2, File: "testdata/writer"},
+		{TableFormat: TableFormatPebblev3, File: "testdata/writer_v3"},
+		{TableFormat: TableFormatPebblev5, File: "testdata/writer_v5"},
+	}
+	for _, tff := range formatFiles {
+		t.Run(tff.TableFormat.String(), func(t *testing.T) {
+			runDataDriven(t, tff.File, tff.TableFormat, parallelism)
+		})
 	}
 }
 func TestWriter(t *testing.T) {