diff --git a/contentcoder.go b/contentcoder.go
index 3a4e874..62752ed 100644
--- a/contentcoder.go
+++ b/contentcoder.go
@@ -17,13 +17,18 @@ package ice
 import (
 	"bytes"
 	"encoding/binary"
+	"errors"
 	"io"
 
 	"github.com/blugelabs/ice/compress"
 )
 
-var termSeparator byte = 0xff
-var termSeparatorSplitSlice = []byte{termSeparator}
+const minTermLenWithEscape = 2
+
+var (
+	termSeparator byte = 0xff
+	termEscape    byte = '\\'
+)
 
 type chunkedContentCoder struct {
 	final []byte
@@ -238,3 +243,49 @@ func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64)
 	}
 	return start, metaHeaders[chunk].DocDvOffset
 }
+
+func encodeTerm(dest, src []byte) []byte {
+	if src == nil {
+		dest = append(dest, termSeparator)
+		return dest
+	}
+	if bytes.IndexByte(src, termSeparator) < 0 && bytes.IndexByte(src, termEscape) < 0 {
+		dest = append(dest, src...)
+		dest = append(dest, termSeparator)
+		return dest
+	}
+	for _, b := range src {
+		if b == termSeparator || b == termEscape {
+			dest = append(dest, termEscape)
+		}
+		dest = append(dest, b)
+	}
+	dest = append(dest, termSeparator)
+	return dest
+}
+
+// nolint: gocritic, nolintlint
+func decodeTerm(dest, src []byte) ([]byte, []byte, error) {
+	if len(src) == 0 {
+		return nil, nil, errors.New("empty term values")
+	}
+	if src[0] == termSeparator {
+		return dest, src[1:], nil
+	}
+	for len(src) > 0 {
+		switch {
+		case src[0] == termEscape:
+			if len(src) < minTermLenWithEscape {
+				return nil, nil, errors.New("invalid termEscape character")
+			}
+			src = src[1:]
+			dest = append(dest, src[0])
+		case src[0] == termSeparator:
+			return dest, src[1:], nil
+		default:
+			dest = append(dest, src[0])
+		}
+		src = src[1:]
+	}
+	return nil, nil, errors.New("invalid term values")
+}
diff --git a/docvalues.go b/docvalues.go
index cb867ed..bdd5fde 100644
--- a/docvalues.go
+++ b/docvalues.go
@@ -15,7 +15,6 @@ package ice
 
 import (
-	"bytes"
 	"encoding/binary"
 	"fmt"
 	"math"
@@ -40,6 +39,7 @@ type docValueReader struct {
 	curChunkHeader []metaData
 	curChunkData   []byte // compressed data cache
 	uncompressed   []byte // temp buf for decompression
+	termBuf        []byte // temp buf for term decoding
 }
 
 func (di *docValueReader) size() int {
@@ -254,14 +254,13 @@ func (di *docValueReader) visitDocValues(docNum uint64,
 	// pick the terms for the given docNum
 	uncompressed = uncompressed[start:end]
 
-	for {
-		i := bytes.Index(uncompressed, termSeparatorSplitSlice)
-		if i < 0 {
-			break
+	startPos := 0
+	for len(uncompressed) > 0 {
+		if di.termBuf, uncompressed, err = decodeTerm(di.termBuf, uncompressed); err != nil {
+			return err
 		}
-
-		visitor(di.field, uncompressed[0:i])
-		uncompressed = uncompressed[i+1:]
+		visitor(di.field, di.termBuf[startPos:])
+		startPos = len(di.termBuf)
 	}
 
 	return nil
diff --git a/new.go b/new.go
index 2760333..31c9c5b 100644
--- a/new.go
+++ b/new.go
@@ -841,9 +841,7 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6
 
 			freqNormOffset++
 
-			docTermMap[docNum] = append(
-				append(docTermMap[docNum], term...),
-				termSeparator)
+			docTermMap[docNum] = encodeTerm(docTermMap[docNum], []byte(term))
 		}
 
 		tfEncoder.Close()
diff --git a/new_test.go b/new_test.go
index b865c60..819532f 100644
--- a/new_test.go
+++ b/new_test.go
@@ -136,6 +136,7 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) (
 		NewFakeField("name", "wow", false, false, true),
 		NewFakeField("desc", "some thing", false, false, true),
 		NewFakeField("tag", "cold", false, false, true),
+		NewFakeField("number", string([]byte{0xff})+" "+string([]byte{'\\'}), false, false, true),
 	}
 
 	doc.FakeComposite("_all", []string{"_id"})
diff --git a/segment_test.go b/segment_test.go
index 09d5918..3e5cea1 100644
--- a/segment_test.go
+++ b/segment_test.go
@@ -387,23 +387,24 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
 		}
 	}()
 
-	fields := []string{"desc", "name", "tag"}
-	fieldTerms := make(map[string][]string)
+	fields := []string{"desc", "name", "tag", "number"}
+	fieldTerms := make(map[string][][]byte)
 	docValueReader, err := seg.DocumentValueReader(fields)
 	if err != nil {
 		t.Fatalf("err building document value reader: %v", err)
 	}
 	err = docValueReader.VisitDocumentValues(0, func(field string, term []byte) {
-		fieldTerms[field] = append(fieldTerms[field], string(term))
+		fieldTerms[field] = append(fieldTerms[field], term)
 	})
 	if err != nil {
 		t.Error(err)
 	}
 
-	expectedFieldTerms := map[string][]string{
-		"name": {"wow"},
-		"desc": {"some", "thing"},
-		"tag":  {"cold"},
+	expectedFieldTerms := map[string][][]byte{
+		"name":   {[]byte("wow")},
+		"desc":   {[]byte("some"), []byte("thing")},
+		"tag":    {[]byte("cold")},
+		"number": {[]byte("\\"), []byte{0xff}},
 	}
 	if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
 		t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)