-
Notifications
You must be signed in to change notification settings - Fork 0
/
text.go
434 lines (412 loc) · 12 KB
/
text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
package magicnumber
// Package file text.go contains the functions that parse bytes as common text and document formats.
import (
"bytes"
"io"
)
// NotASCII reports whether b lies outside the set of bytes this package
// accepts as printable ASCII.
//
// Every byte from 0x20 through 0x7f is accepted. Below 0x20 only NUL, BEL,
// BS, TAB, LF, VT, FF, CR, the DOS end-of-file marker (0x1a) and ESC (0x1b,
// used by ANSI escape codes) are accepted. All remaining bytes, including
// everything from 0x80 upwards, yield true.
func NotASCII(b byte) bool {
	const (
		firstPrintable = 0x20
		lastAccepted   = 0x7f
	)
	if b >= firstPrintable && b <= lastAccepted {
		return false
	}
	// The tolerated bytes outside the printable range.
	switch b {
	case 0x00, // NUL
		'\a', // BEL
		'\b', // BS
		'\t', // TAB
		'\n', // LF
		'\v', // VT
		'\f', // FF
		'\r', // CR
		0x1a, // end of file character commonly used in DOS
		0x1b: // escape character used in ANSI escape codes
		return false
	}
	return true
}
// NotPlainText reports whether b is neither accepted by NotASCII nor an
// "extended ASCII" byte, meaning any value from 0x80 through 0xff.
//
// In effect only the control characters that NotASCII rejects yield true.
func NotPlainText(b byte) bool {
	const extendedBegin = 0x80
	return NotASCII(b) && b < extendedBegin
}
// NonISO889591 reports whether b is not a printable ISO/IEC 8859-1 (Latin-1)
// character.
//
// A byte is accepted when NotASCII accepts it or when it lies in the range
// 0xa0 through 0xff. The bytes 0x80 through 0x9f and the control characters
// rejected by NotASCII yield true.
func NonISO889591(b byte) bool {
	const extendedBegin = 0xa0
	return NotASCII(b) && b < extendedBegin
}
// NonWindows1252 reports whether b is not a printable Windows-1252 character.
//
// Any byte accepted by NonISO889591 is accepted here too. Of the remaining
// bytes, those from 0x80 upwards are accepted unless they are one of the five
// code points treated as unused (0x81, 0x8d, 0x8f, 0x90, 0x9d). Bytes below
// 0x80 that reach this point yield true.
func NonWindows1252(b byte) bool {
	if !NonISO889591(b) {
		return false
	}
	const extendedBegin = 0x80
	if b < extendedBegin {
		return true
	}
	switch b {
	case 0x81, 0x8d, 0x8f, 0x90, 0x9d:
		// code points treated as unused
		return true
	default:
		return false
	}
}
// ASCII reports whether every byte read from r is accepted by NotASCII, that
// is, the content consists solely of printable ASCII plus the handful of
// control characters NotASCII tolerates.
//
// The reader is scanned in 1 KiB chunks up to the size reported by Length.
// A read error other than io.EOF yields false. When Length reports zero no
// bytes are inspected and the result is true.
func ASCII(r io.ReaderAt) bool {
	const chunkSize = 1024
	total := Length(r)
	chunk := make([]byte, chunkSize)
	for pos := int64(0); pos < total; pos += chunkSize {
		// The final chunk may be shorter than chunkSize.
		want := chunkSize
		if remaining := total - pos; remaining < chunkSize {
			want = int(remaining)
		}
		n, err := r.ReadAt(chunk[:want], pos)
		atEOF := err == io.EOF
		if err != nil && !atEOF {
			return false
		}
		for _, c := range chunk[:n] {
			if NotASCII(c) {
				return false
			}
		}
		if atEOF {
			break
		}
	}
	return true
}
// CodePage reports whether the reader is plausibly an IBM code page text file
// of the kind often found on DOS and 16-bit Windows computers.
//
// The function is heuristic and applies the following checks:
//   - two consecutive NUL bytes anywhere in the content mean it is binary
//   - only IBM PC/Microsoft newlines (CR LF) are counted
//   - when the content is longer than 80 bytes, the number of newlines must
//     be at least (size / 80 columns) / 2
//
// The reader is scanned in 1 KiB chunks up to the size reported by Length.
// The last byte of each chunk is remembered so that a NUL pair or a CR LF
// pair split across two chunks is still detected. A read error other than
// io.EOF yields false.
func CodePage(r io.ReaderAt) bool {
	const chunkSize = 1024
	const columns = int64(80)
	const binary, textfile = false, true
	const nul, cr, lf = 0x0, 0x0d, 0x0a
	nulpair := []byte{nul, nul}
	msdosNL := []byte{cr, lf}
	size := Length(r)
	newlineCount := 0
	buf := make([]byte, chunkSize)
	// prev holds the final byte of the preceding chunk once havePrev is set.
	var prev byte
	havePrev := false
	for offset := int64(0); offset < size; offset += chunkSize {
		bytesToRead := chunkSize
		if offset+int64(chunkSize) > size {
			bytesToRead = int(size - offset)
		}
		n, err := r.ReadAt(buf[:bytesToRead], offset)
		if err != nil && err != io.EOF {
			return binary
		}
		chunk := buf[:n]
		// Check the two-byte patterns that straddle the chunk boundary.
		if havePrev && n > 0 {
			switch {
			case prev == nul && chunk[0] == nul:
				return binary
			case prev == cr && chunk[0] == lf:
				newlineCount++
			}
		}
		if bytes.Contains(chunk, nulpair) {
			return binary
		}
		newlineCount += bytes.Count(chunk, msdosNL)
		if err == io.EOF {
			break
		}
		if n > 0 {
			prev, havePrev = chunk[n-1], true
		}
	}
	if size > columns {
		return int64(newlineCount) >= (size/columns)/2
	}
	return textfile
}
// CSI reports whether the reader contains at least three matches of common
// Control Sequence Introducer (CSI) escape codes that are used in ANSI
// encoded texts. This is a heuristic function and does not guarantee that
// the reader contains ANSI encoded text.
//
// A match is the three-byte sequence ESC '[' followed by one of the bytes
// "0123456789JK=su#". Each such sequence is counted at most once per 1 KiB
// chunk, however many times it occurs in that chunk. The last two bytes of
// each chunk are carried into the next search so that a sequence split across
// a chunk boundary is still found. The scan covers the size reported by
// Length, and a read error other than io.EOF yields false.
func CSI(r io.ReaderAt) bool {
	const esc, leftBracket = 0x1b, 0x5b
	const minRequired = 3
	const chunkSize = 1024
	// overlap is one byte less than the length of a searched sequence.
	const overlap = 2
	csi := []byte{esc, leftBracket, 0x0}
	codes := []byte("0123456789JK=su#")
	finds := 0
	size := Length(r)
	window := make([]byte, overlap+chunkSize)
	carried := 0
	for offset := int64(0); offset < size; offset += chunkSize {
		bytesToRead := chunkSize
		if offset+int64(chunkSize) > size {
			bytesToRead = int(size - offset)
		}
		n, err := r.ReadAt(window[carried:carried+bytesToRead], offset)
		if err != nil && err != io.EOF {
			return false
		}
		data := window[:carried+n]
		for _, c := range codes {
			csi[2] = c
			if !bytes.Contains(data, csi) {
				continue
			}
			finds++
			// Decide immediately so that a final match in the last chunk
			// is not lost when the loops run out.
			if finds >= minRequired {
				return true
			}
		}
		if err == io.EOF {
			break
		}
		// Keep the tail of this window for the next search; it is too short
		// to hold a whole sequence, so nothing is counted twice.
		tail := data
		if len(tail) > overlap {
			tail = tail[len(tail)-overlap:]
		}
		carried = copy(window, tail)
	}
	return false
}
// Ansi reports whether the reader contains some common ANSI escape codes.
// For speed and to avoid false positives it only matches four sequences:
// reset (ESC[0m), clear screen (ESC[2J), bold (ESC[1;) and normal (ESC[0;).
//
// The reader is scanned in 1 KiB chunks up to the size reported by Length.
// The last three bytes of each chunk are carried into the next search so that
// a sequence split across a chunk boundary is still found. A read error other
// than io.EOF yields false.
func Ansi(r io.ReaderAt) bool {
	const esc = 0x1b
	const chunkSize = 1024
	// overlap is one byte less than the length of a searched sequence.
	const overlap = 3
	sequences := [][]byte{
		{esc, '[', '0', 'm'}, // reset
		{esc, '[', '2', 'J'}, // clear screen
		{esc, '[', '1', ';'}, // bold
		{esc, '[', '0', ';'}, // normal
	}
	size := Length(r)
	window := make([]byte, overlap+chunkSize)
	carried := 0
	for offset := int64(0); offset < size; offset += chunkSize {
		bytesToRead := chunkSize
		if offset+int64(chunkSize) > size {
			bytesToRead = int(size - offset)
		}
		n, err := r.ReadAt(window[carried:carried+bytesToRead], offset)
		if err != nil && err != io.EOF {
			return false
		}
		data := window[:carried+n]
		for _, seq := range sequences {
			if bytes.Contains(data, seq) {
				return true
			}
		}
		if err == io.EOF {
			break
		}
		// Keep the tail of this window so a sequence that starts at the end
		// of this chunk can be completed by the next one.
		tail := data
		if len(tail) > overlap {
			tail = tail[len(tail)-overlap:]
		}
		carried = copy(window, tail)
	}
	return false
}
// Hlp reports whether the reader carries one of the Windows help file
// signatures. The check is generic and does not differentiate between the
// various versions of the help file format.
//
// The first four bytes are compared against "ITSF" (compiled HTML help),
// '?' 0x5f 0x03 0x00 and 'L' 'N' 0x02 0x00. If none match, the six bytes at
// offset 6 are compared against 00 00 ff ff ff ff. A short or failed read
// yields false.
func Hlp(r io.ReaderAt) bool {
	const headSize = 4
	head := make([]byte, headSize)
	n, err := io.NewSectionReader(r, 0, headSize).Read(head)
	if err != nil || n < headSize {
		return false
	}
	signatures := [][]byte{
		{'I', 'T', 'S', 'F'},  // compiled HTML help
		{'?', 0x5f, 0x3, 0x0}, // Windows help
		{'L', 'N', 0x2, 0x0},  // Windows help, "LN" variant
	}
	for _, sig := range signatures {
		if bytes.Equal(head, sig) {
			return true
		}
	}
	// Fall back to the six-byte marker located at offset 6.
	const markOffset, markSize = 6, 6
	mark := make([]byte, markSize)
	n, err = io.NewSectionReader(r, markOffset, markSize).Read(mark)
	if err != nil || n < markSize {
		return false
	}
	return bytes.Equal(mark, []byte{0x0, 0x0, 0xff, 0xff, 0xff, 0xff})
}
// Pdf reports whether the reader carries the Portable Document Format
// signature.
//
// The content must begin with "%PDF" and must end with one of the recognised
// end-of-file markers: "\n%%EOF", "\n%%EOF\n", "\r\n%%EOF\r\n" or
// "\r%%EOF\r". The end position is taken from Length. A short or failed read
// of the header yields false; a failed read of one trailer simply moves on to
// the next candidate.
func Pdf(r io.ReaderAt) bool {
	const headerSize = 4
	header := make([]byte, headerSize)
	if n, err := io.NewSectionReader(r, 0, headerSize).Read(header); err != nil || n < headerSize {
		return false
	}
	if !bytes.Equal(header, []byte("%PDF")) {
		return false
	}
	total := Length(r)
	trailers := [][]byte{
		[]byte("\n%%EOF"),
		[]byte("\n%%EOF\n"),
		[]byte("\r\n%%EOF\r\n"),
		[]byte("\r%%EOF\r"),
	}
	for _, trailer := range trailers {
		width := int64(len(trailer))
		tail := make([]byte, width)
		// Read exactly as many trailing bytes as this candidate is long.
		n, err := io.NewSectionReader(r, total-width, width).Read(tail)
		if err != nil || int64(n) < width {
			continue
		}
		if bytes.Equal(tail, trailer) {
			return true
		}
	}
	return false
}
// Rtf reports whether the reader carries the Rich Text Format signature.
//
// The content must begin with the five bytes `{\rtf` and its final byte, at
// the position Length(r)-1, must be the closing brace '}'. A short or failed
// read yields false.
func Rtf(r io.ReaderAt) bool {
	const prefixSize = 5
	prefix := make([]byte, prefixSize)
	if n, err := io.NewSectionReader(r, 0, prefixSize).Read(prefix); err != nil || n < prefixSize {
		return false
	}
	if !bytes.Equal(prefix, []byte(`{\rtf`)) {
		return false
	}
	lastPos := Length(r) - 1
	last := make([]byte, 1)
	if n, err := io.NewSectionReader(r, lastPos, 1).Read(last); err != nil || n < 1 {
		return false
	}
	return last[0] == '}'
}
// Txt reports whether the reader contains plain text: ASCII characters,
// tolerated control characters and "extended ASCII" bytes.
//
// Bytes flagged by NotPlainText are tolerated as long as their running count
// stays below the 2% limit applied by threshold; such control characters are
// often found in plain text files for 8-bit microcomputers. The scan stops
// with false as soon as the limit is exceeded. The reader is scanned in 1 KiB
// chunks up to the size reported by Length, and a read error other than
// io.EOF yields false.
func Txt(r io.ReaderAt) bool {
	const chunkSize = 1024
	total := Length(r)
	chunk := make([]byte, chunkSize)
	rejected := 0
	for pos := int64(0); pos < total; pos += chunkSize {
		// The final chunk may be shorter than chunkSize.
		want := chunkSize
		if remaining := total - pos; remaining < chunkSize {
			want = int(remaining)
		}
		n, err := r.ReadAt(chunk[:want], pos)
		if err != nil && err != io.EOF {
			return false
		}
		for _, c := range chunk[:n] {
			if !NotPlainText(c) {
				continue
			}
			rejected++
			if !threshold(rejected, total) {
				return false
			}
		}
		if err == io.EOF {
			break
		}
	}
	return threshold(rejected, total)
}
// threshold reports whether count is strictly less than 2% of size, meaning
// the content may still be treated as plain text. A ratio of exactly 2% or
// more yields false. With a size of zero and a non-negative count the result
// is false, because the ratio is then NaN or +Inf.
func threshold(count int, size int64) bool {
	const limit = 0.02
	ratio := float64(count) / float64(size)
	return ratio < limit
}
// TxtLatin1 reports whether the reader exclusively contains plain text
// ISO/IEC 8859-1 characters, commonly known as the Latin-1 character set.
//
// Every byte must be accepted by NonISO889591. The reader is scanned in 1 KiB
// chunks up to the size reported by Length. A read error other than io.EOF
// yields false. When Length reports zero no bytes are inspected and the
// result is true.
func TxtLatin1(r io.ReaderAt) bool {
	const chunkSize = 1024
	total := Length(r)
	chunk := make([]byte, chunkSize)
	for pos := int64(0); pos < total; pos += chunkSize {
		// The final chunk may be shorter than chunkSize.
		want := chunkSize
		if remaining := total - pos; remaining < chunkSize {
			want = int(remaining)
		}
		n, err := r.ReadAt(chunk[:want], pos)
		atEOF := err == io.EOF
		if err != nil && !atEOF {
			return false
		}
		for _, c := range chunk[:n] {
			if NonISO889591(c) {
				return false
			}
		}
		if atEOF {
			break
		}
	}
	return true
}
// TxtWindows reports whether the reader exclusively contains plain text
// Windows-1252 characters. Windows-1252 extends the Latin-1 character set
// with additional typography characters.
//
// Every byte must be accepted by NonWindows1252. The reader is scanned in
// 1 KiB chunks up to the size reported by Length. A read error other than
// io.EOF yields false. When Length reports zero no bytes are inspected and
// the result is true.
func TxtWindows(r io.ReaderAt) bool {
	const chunkSize = 1024
	total := Length(r)
	chunk := make([]byte, chunkSize)
	for pos := int64(0); pos < total; pos += chunkSize {
		// The final chunk may be shorter than chunkSize.
		want := chunkSize
		if remaining := total - pos; remaining < chunkSize {
			want = int(remaining)
		}
		n, err := r.ReadAt(chunk[:want], pos)
		atEOF := err == io.EOF
		if err != nil && !atEOF {
			return false
		}
		for _, c := range chunk[:n] {
			if NonWindows1252(c) {
				return false
			}
		}
		if atEOF {
			break
		}
	}
	return true
}
// Utf8 reports whether the reader begins with the three-byte UTF-8 Byte Order
// Mark (ef bb bf). A short or failed read yields false.
func Utf8(r io.ReaderAt) bool {
	bom := []byte{0xef, 0xbb, 0xbf}
	head := make([]byte, len(bom))
	n, err := io.NewSectionReader(r, 0, int64(len(bom))).Read(head)
	if err != nil || n < len(bom) {
		return false
	}
	return bytes.Equal(head, bom)
}
// Utf16 reports whether the reader begins with a two-byte UTF-16 Byte Order
// Mark, either ff fe or fe ff. A short or failed read yields false.
func Utf16(r io.ReaderAt) bool {
	const bomSize = 2
	head := make([]byte, bomSize)
	n, err := io.NewSectionReader(r, 0, bomSize).Read(head)
	if err != nil || n < bomSize {
		return false
	}
	marks := [][]byte{
		{0xff, 0xfe},
		{0xfe, 0xff},
	}
	for _, mark := range marks {
		if bytes.Equal(head, mark) {
			return true
		}
	}
	return false
}
// Utf32 reports whether the reader begins with a four-byte UTF-32 Byte Order
// Mark, either ff fe 00 00 or 00 00 fe ff. A short or failed read yields
// false.
func Utf32(r io.ReaderAt) bool {
	const bomSize = 4
	head := make([]byte, bomSize)
	n, err := io.NewSectionReader(r, 0, bomSize).Read(head)
	if err != nil || n < bomSize {
		return false
	}
	marks := [][]byte{
		{0xff, 0xfe, 0x0, 0x0},
		{0x0, 0x0, 0xfe, 0xff},
	}
	for _, mark := range marks {
		if bytes.Equal(head, mark) {
			return true
		}
	}
	return false
}