From b97971aec38c06b4a6c87c314e55b739a24d055d Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Mon, 30 Oct 2023 17:48:26 +0100 Subject: [PATCH] Additional vendored code cleanup Signed-off-by: Dirkjan Bussink Signed-off-by: Vicent Marti --- go/mysql/collations/vindex/collate/index.go | 8 +- go/mysql/collations/vindex/collate/tables.go | 8 - .../vindex/internal/colltab/collelem.go | 215 +------- .../vindex/internal/colltab/contract.go | 59 --- .../vindex/internal/colltab/iter.go | 14 - .../vindex/internal/colltab/table.go | 6 +- .../vindex/internal/colltab/trie.go | 64 --- .../vindex/unicode/norm/composition.go | 34 +- .../vindex/unicode/norm/forminfo.go | 13 - .../collations/vindex/unicode/norm/input.go | 92 +--- .../collations/vindex/unicode/norm/iter.go | 458 ------------------ .../vindex/unicode/norm/normalize.go | 14 +- 12 files changed, 56 insertions(+), 929 deletions(-) delete mode 100644 go/mysql/collations/vindex/unicode/norm/iter.go diff --git a/go/mysql/collations/vindex/collate/index.go b/go/mysql/collations/vindex/collate/index.go index efb118a5873..19073623ce6 100644 --- a/go/mysql/collations/vindex/collate/index.go +++ b/go/mysql/collations/vindex/collate/index.go @@ -16,11 +16,9 @@ func getTable(t tableIndex) *colltab.Table { Index: mainLookup[:], Values: mainValues[:], }, - ExpandElem: mainExpandElem[:], - ContractTries: mainCTEntries[:], - ContractElem: mainContractElem[:], - MaxContractLen: 18, - VariableTop: varTop, + ExpandElem: mainExpandElem[:], + ContractTries: mainCTEntries[:], + ContractElem: mainContractElem[:], } } diff --git a/go/mysql/collations/vindex/collate/tables.go b/go/mysql/collations/vindex/collate/tables.go index 01d2aff2603..f46ff79c5ee 100644 --- a/go/mysql/collations/vindex/collate/tables.go +++ b/go/mysql/collations/vindex/collate/tables.go @@ -1,13 +1,5 @@ package collate -// UnicodeVersion is the Unicode version from which the tables in this package are derived. -const UnicodeVersion = "6.2.0" - -// CLDRVersion is the CLDR version from which the tables in this package are derived. -const CLDRVersion = "23" - -const varTop = 0x30e - // mainExpandElem: 46864 entries, 187456 bytes var mainExpandElem = [46864]uint32{ // Block 0, offset 0x0 diff --git a/go/mysql/collations/vindex/internal/colltab/collelem.go b/go/mysql/collations/vindex/internal/colltab/collelem.go index 0c23c8a48e9..41323dfee4d 100644 --- a/go/mysql/collations/vindex/internal/colltab/collelem.go +++ b/go/mysql/collations/vindex/internal/colltab/collelem.go @@ -5,33 +5,12 @@ package colltab import ( - "fmt" "unicode" ) -// Level identifies the collation comparison level. -// The primary level corresponds to the basic sorting of text. -// The secondary level corresponds to accents and related linguistic elements. -// The tertiary level corresponds to casing and related concepts. -// The quaternary level is derived from the other levels by the -// various algorithms for handling variable elements. -type Level int - -const ( - Primary Level = iota - Secondary - Tertiary - Quaternary - Identity - - NumLevels -) - const ( defaultSecondary = 0x20 - defaultTertiary = 0x2 maxTertiary = 0x1F - MaxQuaternary = 0x1FFFFF // 21 bits. ) // Elem is a representation of a collation element. This API provides ways to encode @@ -42,12 +21,8 @@ type Elem uint32 const ( maxCE Elem = 0xAFFFFFFF - PrivateUse = minContract - minContract = 0xC0000000 - maxContract = 0xDFFFFFFF - minExpand = 0xE0000000 - maxExpand = 0xEFFFFFFF - minDecomp = 0xF0000000 + maxContract Elem = 0xDFFFFFFF + maxExpand Elem = 0xEFFFFFFF ) type ceType int @@ -65,14 +40,11 @@ func (ce Elem) ctype() ceType { } if ce <= maxContract { return ceContractionIndex - } else { - if ce <= maxExpand { - return ceExpansionIndex - } - return ceDecompose } - panic("should not reach here") - return ceType(-1) + if ce <= maxExpand { + return ceExpansionIndex + } + return ceDecompose } // For normal collation elements, we assume that a collation element either has @@ -100,99 +72,23 @@ func (ce Elem) ctype() ceType { // 11qqqqqq qqqqqqqq qqqqqqq0 00000000 // - q* quaternary value const ( - ceTypeMask = 0xC0000000 - ceTypeMaskExt = 0xE0000000 - ceIgnoreMask = 0xF00FFFFF - ceType1 = 0x40000000 - ceType2 = 0x00000000 - ceType3or4 = 0x80000000 - ceType4 = 0xA0000000 - ceTypeQ = 0xC0000000 - Ignore = ceType4 - firstNonPrimary = 0x80000000 - lastSpecialPrimary = 0xA0000000 - secondaryMask = 0x80000000 - hasTertiaryMask = 0x40000000 - primaryValueMask = 0x3FFFFE00 - maxPrimaryBits = 21 - compactPrimaryBits = 16 - maxSecondaryBits = 12 - maxTertiaryBits = 8 - maxCCCBits = 8 - maxSecondaryCompactBits = 8 - maxSecondaryDiffBits = 4 - maxTertiaryCompactBits = 5 - primaryShift = 9 - compactSecondaryShift = 5 - minCompactSecondary = defaultSecondary - 4 + ceTypeMask = 0xC0000000 + ceTypeMaskExt = 0xE0000000 + ceType1 = 0x40000000 + ceType3or4 = 0x80000000 + ceType4 = 0xA0000000 + firstNonPrimary = 0x80000000 + lastSpecialPrimary = 0xA0000000 + primaryValueMask = 0x3FFFFE00 + primaryShift = 9 + compactSecondaryShift = 5 + minCompactSecondary = defaultSecondary - 4 ) func makeImplicitCE(primary int) Elem { return ceType1 | Elem(primary<= 1<= %x", w, 1<= 1<= %x", w, 1<= 1<= %x", w, 1<= 1<= %x", primary, 1<= 1<= %x", secondary, 1<= 1< %x", d, d, 1<= 1< %x", tertiary, 1<> primaryShift } -// Secondary returns the secondary collation weight for ce. -func (ce Elem) Secondary() int { - switch ce & ceTypeMask { - case ceType1: - return int(uint8(ce)) - case ceType2: - return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) - case ceType3or4: - if ce < ceType4 { - return defaultSecondary - } - return int(ce>>8) & 0xFFF - case ceTypeQ: - return 0 - } - panic("should not reach here") -} - -// Tertiary returns the tertiary collation weight for ce. -func (ce Elem) Tertiary() uint8 { - if ce&hasTertiaryMask == 0 { - if ce&ceType3or4 == 0 { - return uint8(ce & 0x1F) - } - if ce&ceType4 == ceType4 { - return uint8(ce) - } - return uint8(ce>>24) & 0x1F // type 2 - } else if ce&ceTypeMask == ceType1 { - return defaultTertiary - } - // ce is a quaternary value. - return 0 -} - func (ce Elem) updateTertiary(t uint8) Elem { if ce&ceTypeMask == ceType1 { // convert to type 4 @@ -267,33 +128,6 @@ func (ce Elem) updateTertiary(t uint8) Elem { return ce | Elem(t) } -// Quaternary returns the quaternary value if explicitly specified, -// 0 if ce == Ignore, or MaxQuaternary otherwise. -// Quaternary values are used only for shifted variants. -func (ce Elem) Quaternary() int { - if ce&ceTypeMask == ceTypeQ { - return int(ce&primaryValueMask) >> primaryShift - } else if ce&ceIgnoreMask == Ignore { - return 0 - } - return MaxQuaternary -} - -// Weight returns the collation weight for the given level. -func (ce Elem) Weight(l Level) int { - switch l { - case Primary: - return ce.Primary() - case Secondary: - return ce.Secondary() - case Tertiary: - return int(ce.Tertiary()) - case Quaternary: - return ce.Quaternary() - } - return 0 // return 0 (ignore) for undefined levels. -} - // For contractions, collation elements are of the form // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where // - n* is the size of the first node in the contraction trie. @@ -316,10 +150,6 @@ func splitContractIndex(ce Elem) (index, n, offset int) { return } -// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, -// where b* is the index into the expansion sequence table. -const maxExpandIndexBits = 16 - func splitExpandIndex(ce Elem) (index int) { return int(uint16(ce)) } @@ -340,18 +170,15 @@ func splitDecompose(ce Elem) (t1, t2 uint8) { const ( // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. minUnified rune = 0x4E00 - maxUnified = 0x9FFF - minCompatibility = 0xF900 - maxCompatibility = 0xFAFF - minRare = 0x3400 - maxRare = 0x4DBF + maxUnified rune = 0x9FFF + minCompatibility rune = 0xF900 + maxCompatibility rune = 0xFAFF ) + const ( commonUnifiedOffset = 0x10000 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF otherOffset = 0x50000 // largest rune in rare is U+2FA1D - illegalOffset = otherOffset + int(unicode.MaxRune) - maxPrimary = illegalOffset + 1 ) // implicitPrimary returns the primary weight for the a rune diff --git a/go/mysql/collations/vindex/internal/colltab/contract.go b/go/mysql/collations/vindex/internal/colltab/contract.go index 25649d4f55f..c677b65c04d 100644 --- a/go/mysql/collations/vindex/internal/colltab/contract.go +++ b/go/mysql/collations/vindex/internal/colltab/contract.go @@ -27,33 +27,16 @@ type ctScanner struct { done bool } -type ctScannerString struct { - states ContractTrieSet - s string - n int - index int - pindex int - done bool -} - func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner { return ctScanner{s: b, states: t[index:], n: n} } -func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString { - return ctScannerString{s: str, states: t[index:], n: n} -} - // result returns the offset i and bytes consumed p so far. If no suffix // matched, i and p will be 0. func (s *ctScanner) result() (i, p int) { return s.index, s.pindex } -func (s *ctScannerString) result() (i, p int) { - return s.index, s.pindex -} - const ( final = 0 noIndex = 0xFF @@ -101,45 +84,3 @@ func (s *ctScanner) scan(p int) int { } return pr } - -// scan is a verbatim copy of ctScanner.scan. -func (s *ctScannerString) scan(p int) int { - pr := p // the p at the rune start - str := s.s - states, n := s.states, s.n - for i := 0; i < n && p < len(str); { - e := states[i] - c := str[p] - // TODO: a significant number of contractions are of a form that - // cannot match discontiguous UTF-8 in a normalized string. We could let - // a negative value of e.n mean that we can set s.done = true and avoid - // the need for additional matches. - if c >= e.L { - if e.L == c { - p++ - if e.I != noIndex { - s.index = int(e.I) - s.pindex = p - } - if e.N != final { - i, states, n = 0, states[int(e.H)+n:], int(e.N) - if p >= len(str) || utf8.RuneStart(str[p]) { - s.states, s.n, pr = states, n, p - } - } else { - s.done = true - return p - } - continue - } else if e.N == final && c <= e.H { - p++ - s.done = true - s.index = int(c-e.L) + int(e.I) - s.pindex = p - return p - } - } - i++ - } - return pr -} diff --git a/go/mysql/collations/vindex/internal/colltab/iter.go b/go/mysql/collations/vindex/internal/colltab/iter.go index 541d66e4778..18434065c75 100644 --- a/go/mysql/collations/vindex/internal/colltab/iter.go +++ b/go/mysql/collations/vindex/internal/colltab/iter.go @@ -130,20 +130,6 @@ func (i *Iter) Next() bool { return done } -// nextNoNorm is the same as next, but does not "normalize" the collation -// elements. -func (i *Iter) nextNoNorm() bool { - // TODO: remove this function. Using this instead of next does not seem - // to improve performance in any significant way. We retain this until - // later for evaluation purposes. - if i.done() { - return false - } - i.appendNext() - i.N = len(i.Elems) - return true -} - const maxCombiningCharacters = 30 // doNorm reorders the collation elements in i.Elems. diff --git a/go/mysql/collations/vindex/internal/colltab/table.go b/go/mysql/collations/vindex/internal/colltab/table.go index f06f15e1e34..14c8541f8ee 100644 --- a/go/mysql/collations/vindex/internal/colltab/table.go +++ b/go/mysql/collations/vindex/internal/colltab/table.go @@ -18,10 +18,8 @@ type Table struct { ExpandElem []uint32 // contraction info - ContractTries ContractTrieSet - ContractElem []uint32 - MaxContractLen int - VariableTop uint32 + ContractTries ContractTrieSet + ContractElem []uint32 } // AppendNext appends the weights corresponding to the next rune or diff --git a/go/mysql/collations/vindex/internal/colltab/trie.go b/go/mysql/collations/vindex/internal/colltab/trie.go index a0eaa0d23be..f303134a9da 100644 --- a/go/mysql/collations/vindex/internal/colltab/trie.go +++ b/go/mysql/collations/vindex/internal/colltab/trie.go @@ -20,14 +20,11 @@ type Trie struct { } const ( - t1 = 0x00 // 0000 0000 tx = 0x80 // 1000 0000 t2 = 0xC0 // 1100 0000 t3 = 0xE0 // 1110 0000 t4 = 0xF0 // 1111 0000 t5 = 0xF8 // 1111 1000 - t6 = 0xFC // 1111 1100 - te = 0xFE // 1111 1110 ) func (t *Trie) lookupValue(n uint16, b byte) Elem { @@ -96,64 +93,3 @@ func (t *Trie) lookup(s []byte) (v Elem, sz int) { // Illegal rune return 0, 1 } - -// The body of lookupString is a verbatim copy of that of lookup. -func (t *Trie) lookupString(s string) (v Elem, sz int) { - c0 := s[0] - switch { - case c0 < tx: - return Elem(t.Values0[c0]), 1 - case c0 < t2: - return 0, 1 - case c0 < t3: - if len(s) < 2 { - return 0, 0 - } - i := t.Index0[c0] - c1 := s[1] - if c1 < tx || t2 <= c1 { - return 0, 1 - } - return t.lookupValue(i, c1), 2 - case c0 < t4: - if len(s) < 3 { - return 0, 0 - } - i := t.Index0[c0] - c1 := s[1] - if c1 < tx || t2 <= c1 { - return 0, 1 - } - o := int(i)<<6 + int(c1) - i = t.Index[o] - c2 := s[2] - if c2 < tx || t2 <= c2 { - return 0, 2 - } - return t.lookupValue(i, c2), 3 - case c0 < t5: - if len(s) < 4 { - return 0, 0 - } - i := t.Index0[c0] - c1 := s[1] - if c1 < tx || t2 <= c1 { - return 0, 1 - } - o := int(i)<<6 + int(c1) - i = t.Index[o] - c2 := s[2] - if c2 < tx || t2 <= c2 { - return 0, 2 - } - o = int(i)<<6 + int(c2) - i = t.Index[o] - c3 := s[3] - if c3 < tx || t2 <= c3 { - return 0, 3 - } - return t.lookupValue(i, c3), 4 - } - // Illegal rune - return 0, 1 -} diff --git a/go/mysql/collations/vindex/unicode/norm/composition.go b/go/mysql/collations/vindex/unicode/norm/composition.go index e2087bce527..c186f64fbf8 100644 --- a/go/mysql/collations/vindex/unicode/norm/composition.go +++ b/go/mysql/collations/vindex/unicode/norm/composition.go @@ -4,7 +4,11 @@ package norm -import "unicode/utf8" +import ( + "unicode/utf8" + + "vitess.io/vitess/go/hack" +) const ( maxNonStarters = 30 @@ -17,6 +21,10 @@ const ( maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128 ) +// MaxSegmentSize is the maximum size of a byte buffer needed to consider any +// sequence of starter and non-starter runes for the purpose of normalization. +const MaxSegmentSize = maxByteBufferSize + // ssState is used for reporting the segment state after inserting a rune. // It is returned by streamSafe.next. type ssState int @@ -111,20 +119,6 @@ type reorderBuffer struct { flushF func(*reorderBuffer) bool } -func (rb *reorderBuffer) init(f Form, src []byte) { - rb.f = *formTable[f] - rb.src.setBytes(src) - rb.nsrc = len(src) - rb.ss = 0 -} - -func (rb *reorderBuffer) initString(f Form, src string) { - rb.f = *formTable[f] - rb.src.setString(src) - rb.nsrc = len(src) - rb.ss = 0 -} - func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) { rb.out = out rb.flushF = f @@ -247,7 +241,7 @@ func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) { // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes. // It flushes the buffer on each new segment start. func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr { - rb.tmpBytes.setBytes(dcomp) + rb.tmpBytes = dcomp // As the streamSafe accounting already handles the counting for modifiers, // we don't have to call next. However, we do need to keep the accounting // intact when flushing the buffer. @@ -271,7 +265,7 @@ func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) { // insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb. func (rb *reorderBuffer) insertCGJ() { - rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))}) + rb.insertSingle(hack.StringBytes(GraphemeJoiner), 0, Properties{size: uint8(len(GraphemeJoiner))}) } // appendRune inserts a rune at the end of the buffer. It is used for Hangul. @@ -383,12 +377,6 @@ func isJamoVT(b []byte) bool { return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1 } -func isHangulWithoutJamoT(b []byte) bool { - c, _ := utf8.DecodeRune(b) - c -= hangulBase - return c < jamoLVTCount && c%jamoTCount == 0 -} - // decomposeHangul writes the decomposed Hangul to buf and returns the number // of bytes written. len(buf) should be at least 9. func decomposeHangul(buf []byte, r rune) int { diff --git a/go/mysql/collations/vindex/unicode/norm/forminfo.go b/go/mysql/collations/vindex/unicode/norm/forminfo.go index 487335d14d3..75f2674486a 100644 --- a/go/mysql/collations/vindex/unicode/norm/forminfo.go +++ b/go/mysql/collations/vindex/unicode/norm/forminfo.go @@ -55,7 +55,6 @@ type formInfo struct { form Form composing, compatibility bool // form type info lookupFunc - nextMain iterFunc } var formTable = []*formInfo{{ @@ -63,25 +62,21 @@ var formTable = []*formInfo{{ composing: true, compatibility: false, info: lookupInfoNFC, - nextMain: nextComposed, }, { form: NFD, composing: false, compatibility: false, info: lookupInfoNFC, - nextMain: nextDecomposed, }, { form: NFKC, composing: true, compatibility: true, info: lookupInfoNFKC, - nextMain: nextComposed, }, { form: NFKD, composing: false, compatibility: true, info: lookupInfoNFKC, - nextMain: nextDecomposed, }} // We do not distinguish between boundaries for NFC, NFD, etc. to avoid @@ -229,14 +224,6 @@ func (f Form) Properties(s []byte) Properties { return compInfo(nfkcData.lookup(s)) } -// PropertiesString returns properties for the first rune in s. -func (f Form) PropertiesString(s string) Properties { - if f == NFC || f == NFD { - return compInfo(nfcData.lookupString(s)) - } - return compInfo(nfkcData.lookupString(s)) -} - // compInfo converts the information contained in v and sz // to a Properties. See the comment at the top of the file // for more information on the format. diff --git a/go/mysql/collations/vindex/unicode/norm/input.go b/go/mysql/collations/vindex/unicode/norm/input.go index 479e35bc258..4dbbcd15c45 100644 --- a/go/mysql/collations/vindex/unicode/norm/input.go +++ b/go/mysql/collations/vindex/unicode/norm/input.go @@ -6,102 +6,46 @@ package norm import "unicode/utf8" -type input struct { - str string - bytes []byte -} +type input []byte func inputBytes(str []byte) input { - return input{bytes: str} -} - -func inputString(str string) input { - return input{str: str} -} - -func (in *input) setBytes(str []byte) { - in.str = "" - in.bytes = str -} - -func (in *input) setString(str string) { - in.str = str - in.bytes = nil -} - -func (in *input) _byte(p int) byte { - if in.bytes == nil { - return in.str[p] - } - return in.bytes[p] + return str } -func (in *input) skipASCII(p, max int) int { - if in.bytes == nil { - for ; p < max && in.str[p] < utf8.RuneSelf; p++ { - } - } else { - for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ { - } +func (in input) skipASCII(p, max int) int { + for ; p < max && in[p] < utf8.RuneSelf; p++ { } return p } -func (in *input) skipContinuationBytes(p int) int { - if in.bytes == nil { - for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ { - } - } else { - for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ { - } +func (in input) skipContinuationBytes(p int) int { + for ; p < len(in) && !utf8.RuneStart(in[p]); p++ { } return p } -func (in *input) appendSlice(buf []byte, b, e int) []byte { - if in.bytes != nil { - return append(buf, in.bytes[b:e]...) - } - for i := b; i < e; i++ { - buf = append(buf, in.str[i]) - } - return buf +func (in input) appendSlice(buf []byte, b, e int) []byte { + return append(buf, in[b:e]...) } -func (in *input) copySlice(buf []byte, b, e int) int { - if in.bytes == nil { - return copy(buf, in.str[b:e]) - } - return copy(buf, in.bytes[b:e]) +func (in input) copySlice(buf []byte, b, e int) int { + return copy(buf, in[b:e]) } -func (in *input) charinfoNFC(p int) (uint16, int) { - if in.bytes == nil { - return nfcData.lookupString(in.str[p:]) - } - return nfcData.lookup(in.bytes[p:]) +func (in input) charinfoNFC(p int) (uint16, int) { + return nfcData.lookup(in[p:]) } -func (in *input) charinfoNFKC(p int) (uint16, int) { - if in.bytes == nil { - return nfkcData.lookupString(in.str[p:]) - } - return nfkcData.lookup(in.bytes[p:]) +func (in input) charinfoNFKC(p int) (uint16, int) { + return nfkcData.lookup(in[p:]) } -func (in *input) hangul(p int) (r rune) { +func (in input) hangul(p int) (r rune) { var size int - if in.bytes == nil { - if !isHangulString(in.str[p:]) { - return 0 - } - r, size = utf8.DecodeRuneInString(in.str[p:]) - } else { - if !isHangul(in.bytes[p:]) { - return 0 - } - r, size = utf8.DecodeRune(in.bytes[p:]) + if !isHangul(in[p:]) { + return 0 } + r, size = utf8.DecodeRune(in[p:]) if size != hangulUTF8Size { return 0 } diff --git a/go/mysql/collations/vindex/unicode/norm/iter.go b/go/mysql/collations/vindex/unicode/norm/iter.go deleted file mode 100644 index 417c6b26894..00000000000 --- a/go/mysql/collations/vindex/unicode/norm/iter.go +++ /dev/null @@ -1,458 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package norm - -import ( - "fmt" - "unicode/utf8" -) - -// MaxSegmentSize is the maximum size of a byte buffer needed to consider any -// sequence of starter and non-starter runes for the purpose of normalization. -const MaxSegmentSize = maxByteBufferSize - -// An Iter iterates over a string or byte slice, while normalizing it -// to a given Form. -type Iter struct { - rb reorderBuffer - buf [maxByteBufferSize]byte - info Properties // first character saved from previous iteration - next iterFunc // implementation of next depends on form - asciiF iterFunc - - p int // current position in input source - multiSeg []byte // remainder of multi-segment decomposition -} - -type iterFunc func(*Iter) []byte - -// Init initializes i to iterate over src after normalizing it to Form f. -func (i *Iter) Init(f Form, src []byte) { - i.p = 0 - if len(src) == 0 { - i.setDone() - i.rb.nsrc = 0 - return - } - i.multiSeg = nil - i.rb.init(f, src) - i.next = i.rb.f.nextMain - i.asciiF = nextASCIIBytes - i.info = i.rb.f.info(i.rb.src, i.p) - i.rb.ss.first(i.info) -} - -// InitString initializes i to iterate over src after normalizing it to Form f. -func (i *Iter) InitString(f Form, src string) { - i.p = 0 - if len(src) == 0 { - i.setDone() - i.rb.nsrc = 0 - return - } - i.multiSeg = nil - i.rb.initString(f, src) - i.next = i.rb.f.nextMain - i.asciiF = nextASCIIString - i.info = i.rb.f.info(i.rb.src, i.p) - i.rb.ss.first(i.info) -} - -// Seek sets the segment to be returned by the next call to Next to start -// at position p. It is the responsibility of the caller to set p to the -// start of a segment. -func (i *Iter) Seek(offset int64, whence int) (int64, error) { - var abs int64 - switch whence { - case 0: - abs = offset - case 1: - abs = int64(i.p) + offset - case 2: - abs = int64(i.rb.nsrc) + offset - default: - return 0, fmt.Errorf("norm: invalid whence") - } - if abs < 0 { - return 0, fmt.Errorf("norm: negative position") - } - if int(abs) >= i.rb.nsrc { - i.setDone() - return int64(i.p), nil - } - i.p = int(abs) - i.multiSeg = nil - i.next = i.rb.f.nextMain - i.info = i.rb.f.info(i.rb.src, i.p) - i.rb.ss.first(i.info) - return abs, nil -} - -// returnSlice returns a slice of the underlying input type as a byte slice. -// If the underlying is of type []byte, it will simply return a slice. -// If the underlying is of type string, it will copy the slice to the buffer -// and return that. -func (i *Iter) returnSlice(a, b int) []byte { - if i.rb.src.bytes == nil { - return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] - } - return i.rb.src.bytes[a:b] -} - -// Pos returns the byte position at which the next call to Next will commence processing. -func (i *Iter) Pos() int { - return i.p -} - -func (i *Iter) setDone() { - i.next = nextDone - i.p = i.rb.nsrc -} - -// Done returns true if there is no more input to process. -func (i *Iter) Done() bool { - return i.p >= i.rb.nsrc -} - -// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. -// For any input a and b for which f(a) == f(b), subsequent calls -// to Next will return the same segments. -// Modifying runes are grouped together with the preceding starter, if such a starter exists. -// Although not guaranteed, n will typically be the smallest possible n. -func (i *Iter) Next() []byte { - return i.next(i) -} - -func nextASCIIBytes(i *Iter) []byte { - p := i.p + 1 - if p >= i.rb.nsrc { - p0 := i.p - i.setDone() - return i.rb.src.bytes[p0:p] - } - if i.rb.src.bytes[p] < utf8.RuneSelf { - p0 := i.p - i.p = p - return i.rb.src.bytes[p0:p] - } - i.info = i.rb.f.info(i.rb.src, i.p) - i.next = i.rb.f.nextMain - return i.next(i) -} - -func nextASCIIString(i *Iter) []byte { - p := i.p + 1 - if p >= i.rb.nsrc { - i.buf[0] = i.rb.src.str[i.p] - i.setDone() - return i.buf[:1] - } - if i.rb.src.str[p] < utf8.RuneSelf { - i.buf[0] = i.rb.src.str[i.p] - i.p = p - return i.buf[:1] - } - i.info = i.rb.f.info(i.rb.src, i.p) - i.next = i.rb.f.nextMain - return i.next(i) -} - -func nextHangul(i *Iter) []byte { - p := i.p - next := p + hangulUTF8Size - if next >= i.rb.nsrc { - i.setDone() - } else if i.rb.src.hangul(next) == 0 { - i.rb.ss.next(i.info) - i.info = i.rb.f.info(i.rb.src, i.p) - i.next = i.rb.f.nextMain - return i.next(i) - } - i.p = next - return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] -} - -func nextDone(i *Iter) []byte { - return nil -} - -// nextMulti is used for iterating over multi-segment decompositions -// for decomposing normal forms. -func nextMulti(i *Iter) []byte { - j := 0 - d := i.multiSeg - // skip first rune - for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { - } - for j < len(d) { - info := i.rb.f.info(input{bytes: d}, j) - if info.BoundaryBefore() { - i.multiSeg = d[j:] - return d[:j] - } - j += int(info.size) - } - // treat last segment as normal decomposition - i.next = i.rb.f.nextMain - return i.next(i) -} - -// nextMultiNorm is used for iterating over multi-segment decompositions -// for composing normal forms. -func nextMultiNorm(i *Iter) []byte { - j := 0 - d := i.multiSeg - for j < len(d) { - info := i.rb.f.info(input{bytes: d}, j) - if info.BoundaryBefore() { - i.rb.compose() - seg := i.buf[:i.rb.flushCopy(i.buf[:])] - i.rb.insertUnsafe(input{bytes: d}, j, info) - i.multiSeg = d[j+int(info.size):] - return seg - } - i.rb.insertUnsafe(input{bytes: d}, j, info) - j += int(info.size) - } - i.multiSeg = nil - i.next = nextComposed - return doNormComposed(i) -} - -// nextDecomposed is the implementation of Next for forms NFD and NFKD. -func nextDecomposed(i *Iter) (next []byte) { - outp := 0 - inCopyStart, outCopyStart := i.p, 0 - for { - if sz := int(i.info.size); sz <= 1 { - i.rb.ss = 0 - p := i.p - i.p++ // ASCII or illegal byte. Either way, advance by 1. - if i.p >= i.rb.nsrc { - i.setDone() - return i.returnSlice(p, i.p) - } else if i.rb.src._byte(i.p) < utf8.RuneSelf { - i.next = i.asciiF - return i.returnSlice(p, i.p) - } - outp++ - } else if d := i.info.Decomposition(); d != nil { - // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. - // Case 1: there is a leftover to copy. In this case the decomposition - // must begin with a modifier and should always be appended. - // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. - p := outp + len(d) - if outp > 0 { - i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) - // TODO: this condition should not be possible, but we leave it - // in for defensive purposes. - if p > len(i.buf) { - return i.buf[:outp] - } - } else if i.info.multiSegment() { - // outp must be 0 as multi-segment decompositions always - // start a new segment. - if i.multiSeg == nil { - i.multiSeg = d - i.next = nextMulti - return nextMulti(i) - } - // We are in the last segment. Treat as normal decomposition. - d = i.multiSeg - i.multiSeg = nil - p = len(d) - } - prevCC := i.info.tccc - if i.p += sz; i.p >= i.rb.nsrc { - i.setDone() - i.info = Properties{} // Force BoundaryBefore to succeed. - } else { - i.info = i.rb.f.info(i.rb.src, i.p) - } - switch i.rb.ss.next(i.info) { - case ssOverflow: - i.next = nextCGJDecompose - fallthrough - case ssStarter: - if outp > 0 { - copy(i.buf[outp:], d) - return i.buf[:p] - } - return d - } - copy(i.buf[outp:], d) - outp = p - inCopyStart, outCopyStart = i.p, outp - if i.info.ccc < prevCC { - goto doNorm - } - continue - } else if r := i.rb.src.hangul(i.p); r != 0 { - outp = decomposeHangul(i.buf[:], r) - i.p += hangulUTF8Size - inCopyStart, outCopyStart = i.p, outp - if i.p >= i.rb.nsrc { - i.setDone() - break - } else if i.rb.src.hangul(i.p) != 0 { - i.next = nextHangul - return i.buf[:outp] - } - } else { - p := outp + sz - if p > len(i.buf) { - break - } - outp = p - i.p += sz - } - if i.p >= i.rb.nsrc { - i.setDone() - break - } - prevCC := i.info.tccc - i.info = i.rb.f.info(i.rb.src, i.p) - if v := i.rb.ss.next(i.info); v == ssStarter { - break - } else if v == ssOverflow { - i.next = nextCGJDecompose - break - } - if i.info.ccc < prevCC { - goto doNorm - } - } - if outCopyStart == 0 { - return i.returnSlice(inCopyStart, i.p) - } else if inCopyStart < i.p { - i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) - } - return i.buf[:outp] -doNorm: - // Insert what we have decomposed so far in the reorderBuffer. - // As we will only reorder, there will always be enough room. - i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) - i.rb.insertDecomposed(i.buf[0:outp]) - return doNormDecomposed(i) -} - -func doNormDecomposed(i *Iter) []byte { - for { - i.rb.insertUnsafe(i.rb.src, i.p, i.info) - if i.p += int(i.info.size); i.p >= i.rb.nsrc { - i.setDone() - break - } - i.info = i.rb.f.info(i.rb.src, i.p) - if i.info.ccc == 0 { - break - } - if s := i.rb.ss.next(i.info); s == ssOverflow { - i.next = nextCGJDecompose - break - } - } - // new segment or too many combining characters: exit normalization - return i.buf[:i.rb.flushCopy(i.buf[:])] -} - -func nextCGJDecompose(i *Iter) []byte { - i.rb.ss = 0 - i.rb.insertCGJ() - i.next = nextDecomposed - i.rb.ss.first(i.info) - buf := doNormDecomposed(i) - return buf -} - -// nextComposed is the implementation of Next for forms NFC and NFKC. -func nextComposed(i *Iter) []byte { - outp, startp := 0, i.p - var prevCC uint8 - for { - if !i.info.isYesC() { - goto doNorm - } - prevCC = i.info.tccc - sz := int(i.info.size) - if sz == 0 { - sz = 1 // illegal rune: copy byte-by-byte - } - p := outp + sz - if p > len(i.buf) { - break - } - outp = p - i.p += sz - if i.p >= i.rb.nsrc { - i.setDone() - break - } else if i.rb.src._byte(i.p) < utf8.RuneSelf { - i.rb.ss = 0 - i.next = i.asciiF - break - } - i.info = i.rb.f.info(i.rb.src, i.p) - if v := i.rb.ss.next(i.info); v == ssStarter { - break - } else if v == ssOverflow { - i.next = nextCGJCompose - break - } - if i.info.ccc < prevCC { - goto doNorm - } - } - return i.returnSlice(startp, i.p) -doNorm: - // reset to start position - i.p = startp - i.info = i.rb.f.info(i.rb.src, i.p) - i.rb.ss.first(i.info) - if i.info.multiSegment() { - d := i.info.Decomposition() - info := i.rb.f.info(input{bytes: d}, 0) - i.rb.insertUnsafe(input{bytes: d}, 0, info) - i.multiSeg = d[int(info.size):] - i.next = nextMultiNorm - return nextMultiNorm(i) - } - i.rb.ss.first(i.info) - i.rb.insertUnsafe(i.rb.src, i.p, i.info) - return doNormComposed(i) -} - -func doNormComposed(i *Iter) []byte { - // First rune should already be inserted. - for { - if i.p += int(i.info.size); i.p >= i.rb.nsrc { - i.setDone() - break - } - i.info = i.rb.f.info(i.rb.src, i.p) - if s := i.rb.ss.next(i.info); s == ssStarter { - break - } else if s == ssOverflow { - i.next = nextCGJCompose - break - } - i.rb.insertUnsafe(i.rb.src, i.p, i.info) - } - i.rb.compose() - seg := i.buf[:i.rb.flushCopy(i.buf[:])] - return seg -} - -func nextCGJCompose(i *Iter) []byte { - i.rb.ss = 0 // instead of first - i.rb.insertCGJ() - i.next = nextComposed - // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, - // even if they are not. This is particularly dubious for U+FF9E and UFF9A. - // If we ever change that, insert a check here. - i.rb.ss.first(i.info) - i.rb.insertUnsafe(i.rb.src, i.p, i.info) - return doNormComposed(i) -} diff --git a/go/mysql/collations/vindex/unicode/norm/normalize.go b/go/mysql/collations/vindex/unicode/norm/normalize.go index 2a6964a41c4..eadfbf4a2c6 100644 --- a/go/mysql/collations/vindex/unicode/norm/normalize.go +++ b/go/mysql/collations/vindex/unicode/norm/normalize.go @@ -71,7 +71,7 @@ func patchTail(rb *reorderBuffer) bool { rb.insertCGJ() rb.ss = 0 } - rb.insertUnsafe(inputBytes(buf), 0, info) + rb.insertUnsafe(buf, 0, info) return true } @@ -152,12 +152,6 @@ func doAppendInner(rb *reorderBuffer, p int) []byte { return rb.out } -// AppendString returns f(append(out, []byte(s))). -// The buffer out must be nil, empty, or equal to f(out). -func (f Form) AppendString(out []byte, src string) []byte { - return f.doAppend(out, inputString(src), len(src)) -} - // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and // whether any non-normalized parts were found. If atEOF is false, n will // not point past the last segment if this segment might be become @@ -249,12 +243,6 @@ func (f Form) firstBoundary(src input, nsrc int) int { } } -// FirstBoundaryInString returns the position i of the first boundary in s -// or -1 if s contains no boundary. -func (f Form) FirstBoundaryInString(s string) int { - return f.firstBoundary(inputString(s), len(s)) -} - // decomposeSegment scans the first segment in src into rb. It inserts 0x034f // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters // and returns the number of bytes consumed from src or iShortDst or iShortSrc.