Skip to content

Commit

Permalink
fix: handle terms containing termSeparator
Browse files Browse the repository at this point in the history
- Updated encodeTerm to escape termSeparator and termEscape characters.
- Modified decodeTerm to correctly handle escaped characters and
  return the appropriate sub-slice.

Signed-off-by: Gao Hongtao <[email protected]>
  • Loading branch information
hanahmily committed Nov 7, 2024
1 parent 73d06af commit 864fdb4
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 20 deletions.
55 changes: 53 additions & 2 deletions contentcoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@ package ice
import (
"bytes"
"encoding/binary"
"errors"
"io"

"github.com/blugelabs/ice/compress"
)

var termSeparator byte = 0xff
var termSeparatorSplitSlice = []byte{termSeparator}
const minTermLenWithEscape = 2

var (
termSeparator byte = 0xff
termEscape byte = '\\'
)

type chunkedContentCoder struct {
final []byte
Expand Down Expand Up @@ -238,3 +243,49 @@ func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64)
}
return start, metaHeaders[chunk].DocDvOffset
}

// encodeTerm appends the encoded form of src to dest and returns the
// extended slice.
//
// Every termSeparator or termEscape byte inside src is prefixed with
// termEscape, and the encoded term is always closed with a single
// termSeparator. A nil or empty src therefore encodes to just the
// separator byte.
func encodeTerm(dest, src []byte) []byte {
	hasSpecial := bytes.IndexByte(src, termSeparator) >= 0 ||
		bytes.IndexByte(src, termEscape) >= 0
	if !hasSpecial {
		// Nothing to escape: copy the term verbatim and terminate it.
		return append(append(dest, src...), termSeparator)
	}
	for i := range src {
		c := src[i]
		if c == termSeparator || c == termEscape {
			dest = append(dest, termEscape)
		}
		dest = append(dest, c)
	}
	return append(dest, termSeparator)
}

// decodeTerm reads one encoded term from the front of src and appends its
// unescaped bytes to dest.
//
// A termEscape byte causes the byte that follows it to be copied literally;
// an unescaped termSeparator ends the term. On success it returns the
// extended dest and the part of src that follows the separator. It returns
// nil slices and an error when src is empty, when a termEscape is the last
// byte of src, or when src ends before a termSeparator is seen.
//
// nolint: gocritic, nolintlint
func decodeTerm(dest, src []byte) ([]byte, []byte, error) {
	if len(src) == 0 {
		return nil, nil, errors.New("empty term values")
	}
	// A leading separator is an empty term: dest is returned untouched.
	if src[0] == termSeparator {
		return dest, src[1:], nil
	}
	for pos := 0; pos < len(src); pos++ {
		cur := src[pos]
		if cur == termEscape {
			// The escape must be followed by the byte it protects.
			if len(src)-pos < minTermLenWithEscape {
				return nil, nil, errors.New("invalid termEscape character")
			}
			pos++
			dest = append(dest, src[pos])
			continue
		}
		if cur == termSeparator {
			return dest, src[pos+1:], nil
		}
		dest = append(dest, cur)
	}
	// Ran out of input without finding the terminating separator.
	return nil, nil, errors.New("invalid term values")
}
15 changes: 7 additions & 8 deletions docvalues.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
package ice

import (
"bytes"
"encoding/binary"
"fmt"
"math"
Expand All @@ -40,6 +39,7 @@ type docValueReader struct {
curChunkHeader []metaData
curChunkData []byte // compressed data cache
uncompressed []byte // temp buf for decompression
termBuf []byte // temp buf for term decoding
}

func (di *docValueReader) size() int {
Expand Down Expand Up @@ -254,14 +254,13 @@ func (di *docValueReader) visitDocValues(docNum uint64,

// pick the terms for the given docNum
uncompressed = uncompressed[start:end]
for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 {
break
startPos := 0
for len(uncompressed) > 0 {
if di.termBuf, uncompressed, err = decodeTerm(di.termBuf, uncompressed); err != nil {
return err
}

visitor(di.field, uncompressed[0:i])
uncompressed = uncompressed[i+1:]
visitor(di.field, di.termBuf[startPos:])
startPos = len(di.termBuf)
}

return nil
Expand Down
4 changes: 1 addition & 3 deletions new.go
Original file line number Diff line number Diff line change
Expand Up @@ -841,9 +841,7 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6

freqNormOffset++

docTermMap[docNum] = append(
append(docTermMap[docNum], term...),
termSeparator)
docTermMap[docNum] = encodeTerm(docTermMap[docNum], []byte(term))
}

tfEncoder.Close()
Expand Down
1 change: 1 addition & 0 deletions new_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) (
NewFakeField("name", "wow", false, false, true),
NewFakeField("desc", "some thing", false, false, true),
NewFakeField("tag", "cold", false, false, true),
NewFakeField("number", string([]byte{0xff})+" "+string([]byte{'\\'}), false, false, true),
}
doc.FakeComposite("_all", []string{"_id"})

Expand Down
15 changes: 8 additions & 7 deletions segment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,23 +387,24 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
}
}()

fields := []string{"desc", "name", "tag"}
fieldTerms := make(map[string][]string)
fields := []string{"desc", "name", "tag", "number"}
fieldTerms := make(map[string][][]byte)
docValueReader, err := seg.DocumentValueReader(fields)
if err != nil {
t.Fatalf("err building document value reader: %v", err)
}
err = docValueReader.VisitDocumentValues(0, func(field string, term []byte) {
fieldTerms[field] = append(fieldTerms[field], string(term))
fieldTerms[field] = append(fieldTerms[field], term)
})
if err != nil {
t.Error(err)
}

expectedFieldTerms := map[string][]string{
"name": {"wow"},
"desc": {"some", "thing"},
"tag": {"cold"},
expectedFieldTerms := map[string][][]byte{
"name": {[]byte("wow")},
"desc": {[]byte("some"), []byte("thing")},
"tag": {[]byte("cold")},
"number": {[]byte("\\"), []byte{0xff}},
}
if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)
Expand Down

0 comments on commit 864fdb4

Please sign in to comment.