lexer.go

package simplexer

import (
	"io"
	"strings"
)

// Default values for the properties of Lexer, defined as package-level variables.
var (
	DefaultWhitespace = NewPatternTokenType(-1, []string{" ", "\t", "\r", "\n"})

	DefaultTokenTypes = []TokenType{
		NewRegexpTokenType(IDENT, `[a-zA-Z_][a-zA-Z0-9_]*`),
		NewRegexpTokenType(NUMBER, `[0-9]+(?:\.[0-9]+)?`),
		NewRegexpTokenType(STRING, `\"([^"]*)\"`),
		NewRegexpTokenType(OTHER, `.`),
	}
)
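
// For example, with these defaults the input `msg = "hi"` yields an IDENT
// ("msg"), an OTHER ("="), and a STRING (`"hi"`), with the spaces skipped
// as DefaultWhitespace.
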
/*
Lexer is the lexical analyzer.

Whitespace is a TokenType for skipping characters like whitespaces.
The default value is simplexer.DefaultWhitespace.
If Whitespace is nil, the Lexer won't skip any characters.

TokenTypes is an array of TokenType.
The Lexer checks the TokenTypes in order and returns the first token that matches.
The default value is simplexer.DefaultTokenTypes.

Be careful: a TokenType appended after OTHER will never be used, because
OTHER accepts any single character. Custom types must come earlier in the
array, as in the sketch after this struct.
*/
type Lexer struct {
	reader     io.Reader
	buf        string
	loadedLine string
	nextPos    Position

	Whitespace TokenType
	TokenTypes []TokenType
}
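
// A customization sketch: because the Lexer checks TokenTypes in order and
// OTHER matches any single character, a custom TokenType must be prepended,
// never appended. The KEYWORD constant and its pattern are illustrative
// assumptions, not part of this package:
//
//	const KEYWORD = 100 // hypothetical token ID; the -1 above suggests plain integers work
//
//	lexer := NewLexer(reader)
//	lexer.TokenTypes = append([]TokenType{
//		NewRegexpTokenType(KEYWORD, `if|else|while`),
//	}, lexer.TokenTypes...)
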
// NewLexer makes a new Lexer that reads from reader, with the default
// Whitespace and TokenTypes.
func NewLexer(reader io.Reader) *Lexer {
	l := new(Lexer)
	l.reader = reader
	l.Whitespace = DefaultWhitespace
	l.TokenTypes = DefaultTokenTypes

	return l
}
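
// A minimal construction sketch (the input string is illustrative; a client
// would import this package as simplexer):
//
//	lexer := simplexer.NewLexer(strings.NewReader("x = 42"))
//	token, err := lexer.Scan() // token.Literal == "x", matched as IDENT
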
// readBufIfNeed reads up to 2048 more bytes from the reader whenever the
// buffer is running low.
func (l *Lexer) readBufIfNeed() {
	if len(l.buf) < 1024 {
		buf := make([]byte, 2048)
		n, _ := l.reader.Read(buf)
		l.buf += string(buf[:n])
	}
}

// consumeBuffer drops t.Literal from the head of the buffer and advances the
// position and current-line bookkeeping.
func (l *Lexer) consumeBuffer(t *Token) {
	if t == nil {
		return
	}

	l.buf = l.buf[len(t.Literal):]
	l.nextPos = shiftPos(l.nextPos, t.Literal)

	if idx := strings.LastIndex(t.Literal, "\n"); idx >= 0 {
		l.loadedLine = t.Literal[idx+1:]
	} else {
		l.loadedLine += t.Literal
	}
}

// skipWhitespace consumes leading tokens that match the Whitespace TokenType.
func (l *Lexer) skipWhitespace() {
	if l.Whitespace == nil {
		return
	}

	for {
		l.readBufIfNeed()

		if t := l.Whitespace.FindToken(l.buf, l.nextPos); t != nil {
			l.consumeBuffer(t)
		} else {
			break
		}
	}
}

// makeError builds an UnknownTokenError for the unmatched text at the head of
// the buffer: it scans forward until Whitespace or some TokenType matches
// again, and reports everything before that point as the unknown literal.
func (l *Lexer) makeError() error {
	for shift := range l.buf {
		if l.Whitespace != nil && l.Whitespace.FindToken(l.buf[shift:], l.nextPos) != nil {
			return UnknownTokenError{
				Literal:  l.buf[:shift],
				Position: l.nextPos,
			}
		}

		for _, tokenType := range l.TokenTypes {
			if tokenType.FindToken(l.buf[shift:], l.nextPos) != nil {
				return UnknownTokenError{
					Literal:  l.buf[:shift],
					Position: l.nextPos,
				}
			}
		}
	}

	return UnknownTokenError{
		Literal:  l.buf,
		Position: l.nextPos,
	}
}

/*
Peek returns the first token in the buffer without consuming it.

Returns a nil *Token if the buffer is empty.
*/
func (l *Lexer) Peek() (*Token, error) {
	l.skipWhitespace()
	l.readBufIfNeed()

	for _, tokenType := range l.TokenTypes {
		if t := tokenType.FindToken(l.buf, l.nextPos); t != nil {
			return t, nil
		}
	}

	if len(l.buf) > 0 {
		return nil, l.makeError()
	}

	return nil, nil
}
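
// A lookahead sketch, as a parser might use Peek (the token handling is
// illustrative; Token's Literal field is the one used by consumeBuffer above):
//
//	token, err := lexer.Peek()
//	if err == nil && token != nil && token.Literal == "(" {
//		lexer.Scan() // consume the "(" only after deciding what it starts
//	}
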
/*
Scan returns the first token in the buffer and removes it from the buffer.

Scan uses Lexer.Peek internally; please see the documentation of Peek.
*/
func (l *Lexer) Scan() (*Token, error) {
	t, e := l.Peek()

	l.consumeBuffer(t)

	return t, e
}
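
// A typical scanning loop (the printing is illustrative; Scan returns a nil
// token once the input is exhausted):
//
//	for {
//		token, err := lexer.Scan()
//		if err != nil {
//			return err
//		}
//		if token == nil {
//			break // end of input
//		}
//		fmt.Printf("%q\n", token.Literal)
//	}
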
/*
GetLastLine returns the line that the most recently scanned token is on.
*/
func (l *Lexer) GetLastLine() string {
	l.readBufIfNeed()

	if idx := strings.Index(l.buf, "\n"); idx >= 0 {
		return l.loadedLine + l.buf[:idx]
	}

	return l.loadedLine + l.buf
}
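
// An error-reporting sketch (the message format is illustrative; the error
// from Scan is the UnknownTokenError built by makeError above):
//
//	if _, err := lexer.Scan(); err != nil {
//		fmt.Println(err)                 // e.g. an UnknownTokenError
//		fmt.Println(lexer.GetLastLine()) // the line the lexer stopped on
//	}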