-
Notifications
You must be signed in to change notification settings - Fork 32
/
buff.go
309 lines (257 loc) · 8.05 KB
/
buff.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
package fixedwidth
import (
"bytes"
"errors"
"strings"
"unicode/utf8"
)
// lineBuilder is a multibyte-character-aware buffer that can be used to efficiently
// build a line of fixed-width text.
type lineBuilder struct {
// data holds the raw bytes of the line.
data []byte
// codepointIndices maps codepoint positions to byte offsets in `data`:
// `codepointIndices[n]` is the offset of the first byte of the n-th codepoint.
// It stays nil until WriteValue receives a value that carries codepoint indices;
// while it is nil, positions and byte offsets are treated as identical.
// NOTE(review): presumably only populated when `SetUseCodepointIndices` has been
// called on `Encoder`, which is defined outside this file — confirm.
codepointIndices []int
}
// newLineBuilder makes a new lineBuilder whose data has length len and capacity cap,
// with every byte set to fillChar.
//
// A zero len yields an empty but usable builder. As with the built-in make, a negative
// len or a cap smaller than len panics.
func newLineBuilder(len, cap int, fillChar byte) *lineBuilder {
	data := make([]byte, len, cap)

	// Nothing to fill; seeding data[0] below would panic on an empty slice.
	if len == 0 {
		return &lineBuilder{data: data}
	}

	// Seed one byte, then repeatedly double the filled prefix with copy until the
	// whole slice is covered. copy stops at the end of the destination, so the
	// final pass cannot overrun.
	data[0] = fillChar
	for filled := 1; filled < len; filled *= 2 {
		copy(data[filled:], data[:filled])
	}

	return &lineBuilder{data: data}
}
// lineBufferFromValue builds a lineBuilder holding the contents of value.
//
// The builder starts out space-filled with one byte per codepoint of value and with
// capacity for value's full byte length; value is then written at position 0.
func lineBufferFromValue(value rawValue) *lineBuilder {
	builder := newLineBuilder(value.len(), value.byteLen(), ' ')
	builder.WriteValue(0, value)
	return builder
}
// WriteValue writes the given value to the lineBuilder at the given start index.
//
// start is a codepoint position, not a byte offset. Once codepoint indices are in
// use, the codepoints at positions start through start+value.len()-1 are replaced by
// value; if the byte widths differ, the underlying data is grown or shrunk and the
// offsets of later codepoints are shifted accordingly. An empty value is a no-op.
func (b *lineBuilder) WriteValue(start int, value rawValue) {
	if len(value.data) == 0 {
		return
	}

	if !b.hasMultiByteChar() {
		// Single-byte text on both sides: positions equal byte offsets, so a plain
		// copy is enough.
		if !value.hasMultiByteChar() {
			copy(b.data[start:], value.data)
			return
		}
		// First value with codepoint indices seen by this builder; start tracking.
		b.initializeIndices()
	}

	// Translate the codepoint range being replaced into a byte span.
	last := start + value.len() - 1
	lo := b.codepointIndices[start]
	span := b.data[lo : b.byteEndIndex(last)+1]

	// Resize the span when the incoming value needs a different number of bytes,
	// then re-slice because adjustByteSpan moved the end of the span.
	delta := value.byteLen() - len(span)
	if delta != 0 {
		b.adjustByteSpan(last, delta)
		span = b.data[lo : b.byteEndIndex(last)+1]
	}

	copy(span, value.data)

	// The index entries covering the written range are stale whenever the span was
	// resized or the value itself carries codepoint indices.
	if delta != 0 || value.hasMultiByteChar() {
		b.correctIndices(start, value)
	}
}
// WriteASCII writes an ASCII string to the line builder at the given start index.
//
// data is wrapped in a rawValue without codepoint indices, so it is handled as
// single-byte text.
func (b *lineBuilder) WriteASCII(start int, data string) {
	b.WriteValue(start, rawValue{data: data})
}
// String returns the line's current bytes converted to a string.
func (b *lineBuilder) String() string {
	return string(b.data)
}
// AsRawValue returns the builder's contents as a rawValue.
//
// The data is taken from String, but codepointIndices is the builder's own slice
// rather than a copy, so later index updates on the builder may be visible through
// the returned value.
func (b *lineBuilder) AsRawValue() rawValue {
	var value rawValue
	value.data = b.String()
	value.codepointIndices = b.codepointIndices
	return value
}
// initializeIndices starts codepoint tracking by giving every byte of data its own
// entry: position n maps to byte offset n.
func (b *lineBuilder) initializeIndices() {
	indices := make([]int, 0, len(b.data))
	for offset := range b.data {
		indices = append(indices, offset)
	}
	b.codepointIndices = indices
}
// correctIndices rewrites the index entries for the codepoints of value, starting at
// codepoint position start.
func (b *lineBuilder) correctIndices(start int, value rawValue) {
	// Byte offset of the first rewritten entry: one past the last byte of the
	// codepoint before start.
	base := b.byteEndIndex(start-1) + 1

	if value.hasMultiByteChar() {
		// value's own indices are added to base as relative offsets.
		for n, rel := range value.codepointIndices {
			b.codepointIndices[start+n] = base + rel
		}
		return
	}

	// No indices on value: one byte per codepoint, so offsets are consecutive.
	for n, count := 0, value.len(); n < count; n++ {
		b.codepointIndices[start+n] = base + n
	}
}
// adjustByteSpan grows (diff > 0) or shrinks (diff < 0) data by |diff| bytes at the
// last byte of the codepoint at position end, and adds diff to the byte offset of
// every codepoint after end. Bytes added when growing are spaces.
func (b *lineBuilder) adjustByteSpan(end, diff int) {
	tail := b.byteEndIndex(end)

	if diff > 0 {
		// Grow: pad the buffer, then slide everything from tail onwards right by
		// diff. copy handles the overlapping ranges.
		b.data = append(b.data, bytes.Repeat([]byte{' '}, diff)...)
		copy(b.data[tail+diff:], b.data[tail:])
	} else if diff < 0 {
		// Shrink: slide everything from tail onwards left by |diff|, then drop the
		// now-duplicated bytes at the end.
		copy(b.data[tail+diff:], b.data[tail:])
		b.data = b.data[:len(b.data)+diff]
	}

	// Codepoints after end now start diff bytes later (or earlier).
	for pos := end + 1; pos < len(b.codepointIndices); pos++ {
		b.codepointIndices[pos] += diff
	}
}
// byteStartIndex returns the byte offset at which the codepoint at position start
// begins. Without codepoint indices, positions and byte offsets coincide.
func (b *lineBuilder) byteStartIndex(start int) int {
	if b.codepointIndices != nil {
		return b.codepointIndices[start]
	}
	return start
}
// byteEndIndex returns the offset of the last byte (inclusive) of the codepoint at
// position end.
func (b *lineBuilder) byteEndIndex(end int) int {
	switch {
	case b.codepointIndices == nil:
		// No indices: positions and byte offsets coincide.
		return end
	case end == len(b.codepointIndices)-1:
		// The final codepoint runs to the end of the data.
		return len(b.data) - 1
	default:
		// Otherwise it ends just before the next codepoint starts.
		return b.codepointIndices[end+1] - 1
	}
}
// hasMultiByteChar reports whether the builder is tracking codepoint indices, i.e.
// whether codepointIndices is non-nil. An empty but non-nil slice counts as tracking.
func (b *lineBuilder) hasMultiByteChar() bool {
	return b.codepointIndices != nil
}
// rawValue is a piece of text together with optional codepoint position information.
type rawValue struct {
// data is the text itself.
data string
// codepointIndices maps codepoint positions to byte offsets in `data`:
// `codepointIndices[n]` is the offset of the first byte of the n-th codepoint.
// newRawValue leaves it nil unless codepoint indexing was requested and `data`
// contains at least one byte with the high bit set; while it is nil, positions
// and byte offsets are treated as identical.
// NOTE(review): presumably codepoint indexing is requested when
// `SetUseCodepointIndices` has been called on `Decoder` or `Encoder`, which are
// defined outside this file — confirm.
codepointIndices []int
}
// trimLeft returns a copy of v with all leading characters contained in cutset
// removed. If v carries codepoint indices, the result carries the indices of the
// remaining codepoints, rebased to the trimmed data.
func (v rawValue) trimLeft(cutset string) rawValue {
	trimmed := rawValue{data: strings.TrimLeft(v.data, cutset)}
	if v.codepointIndices != nil {
		removed := len(v.data) - len(trimmed.data)
		trimmed.codepointIndices = v.trimCodepointIndices(removed, 0)
	}
	return trimmed
}
// trimRight returns a copy of v with all trailing characters contained in cutset
// removed. If v carries codepoint indices, the result carries the indices of the
// remaining codepoints.
func (v rawValue) trimRight(cutset string) rawValue {
	trimmed := rawValue{data: strings.TrimRight(v.data, cutset)}
	if v.codepointIndices != nil {
		removed := len(v.data) - len(trimmed.data)
		trimmed.codepointIndices = v.trimCodepointIndices(0, removed)
	}
	return trimmed
}
// trim returns a copy of v with all leading and trailing characters contained in
// cutset removed. If v carries codepoint indices, the result carries the indices of
// the remaining codepoints, rebased to the trimmed data.
func (v rawValue) trim(cutset string) rawValue {
	// Trim each side separately so the number of bytes removed per side is known.
	afterLeft := strings.TrimLeft(v.data, cutset)
	afterBoth := strings.TrimRight(afterLeft, cutset)

	trimmed := rawValue{data: afterBoth}
	if v.codepointIndices != nil {
		removedLeft := len(v.data) - len(afterLeft)
		removedRight := len(afterLeft) - len(afterBoth)
		trimmed.codepointIndices = v.trimCodepointIndices(removedLeft, removedRight)
	}
	return trimmed
}
// trimCodepointIndices returns the entries of codepointIndices that survive removing
// leftRemovedBytes bytes from the front and rightRemovedBytes bytes from the back of
// data, each reduced by leftRemovedBytes. The result is never nil.
func (v rawValue) trimCodepointIndices(leftRemovedBytes int, rightRemovedBytes int) []int {
	// First byte offset that falls into the removed right-hand part.
	upper := len(v.data) - rightRemovedBytes

	kept := make([]int, 0, len(v.codepointIndices))
	for _, offset := range v.codepointIndices {
		if offset < leftRemovedBytes || offset >= upper {
			continue
		}
		kept = append(kept, offset-leftRemovedBytes)
	}
	return kept
}
// newRawValue wraps data in a rawValue.
//
// When useCodepointIndices is true and data contains at least one byte with the high
// bit set, codepointIndices is filled with the byte offset of every codepoint;
// otherwise it is left nil.
//
// NOTE(review): the error path needs utf8.DecodeRuneInString to report a width of 0,
// which it only does for an empty string. Invalid UTF-8 decodes with width 1, so it
// is indexed byte by byte rather than rejected — confirm this is intended.
func newRawValue(data string, useCodepointIndices bool) (rawValue, error) {
	value := rawValue{data: data}
	if !useCodepointIndices {
		return value, nil
	}

	offset := findFirstMultiByteChar(data)
	if offset >= len(data) {
		// Single-byte characters only; no indices needed.
		return value, nil
	}

	// Everything before the first multibyte character is one byte per codepoint.
	indices := make([]int, offset)
	for i := range indices {
		indices[i] = i
	}

	// From there on, decode codepoint by codepoint and record where each starts.
	for offset < len(data) {
		_, width := utf8.DecodeRuneInString(data[offset:])
		if width == 0 {
			return rawValue{}, errors.New("fixedwidth: Invalid codepoint")
		}
		indices = append(indices, offset)
		offset += width
	}

	value.codepointIndices = indices
	return value, nil
}
// len returns the length of the value in codepoints when codepoint indices are
// present, and in bytes otherwise.
func (v rawValue) len() int {
	if v.codepointIndices != nil {
		return len(v.codepointIndices)
	}
	return len(v.data)
}
// byteLen returns the length of the value's data in bytes.
func (v rawValue) byteLen() int {
	return len(v.data)
}
// hasMultiByteChar reports whether the value carries codepoint indices, i.e. whether
// codepointIndices is non-nil. An empty but non-nil slice counts as carrying them.
func (v rawValue) hasMultiByteChar() bool {
	return v.codepointIndices != nil
}
// byteStartIndex returns the byte offset at which the codepoint at position start
// begins. Without codepoint indices, positions and byte offsets coincide.
func (v rawValue) byteStartIndex(start int) int {
	if v.codepointIndices != nil {
		return v.codepointIndices[start]
	}
	return start
}
// byteEndIndex returns the offset of the last byte (inclusive) of the codepoint at
// position end.
func (v rawValue) byteEndIndex(end int) int {
	switch {
	case v.codepointIndices == nil:
		// No indices: positions and byte offsets coincide.
		return end
	case end == len(v.codepointIndices)-1:
		// The final codepoint runs to the end of the data.
		return len(v.data) - 1
	default:
		// Otherwise it ends just before the next codepoint starts.
		return v.codepointIndices[end+1] - 1
	}
}
// slice returns the codepoints at positions start through end (both inclusive) as a
// new rawValue. Codepoint indexing is requested for the result only when v itself
// carries codepoint indices.
func (v rawValue) slice(start, end int) (rawValue, error) {
	from := v.byteStartIndex(start)
	to := v.byteEndIndex(end) + 1
	return newRawValue(v.data[from:to], v.hasMultiByteChar())
}
// findFirstMultiByteChar scans data byte by byte and returns the index of the first
// byte with its high bit set (the start of the first multi-byte character), or
// len(data) if there is none.
func findFirstMultiByteChar(data string) int {
	for i := 0; i < len(data); i++ {
		// Bytes above 0x7f are not plain single-byte characters.
		if data[i] > 0x7f {
			return i
		}
	}
	return len(data)
}