forked from lloyddewit/RScript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clsRToken.vb
387 lines (356 loc) · 20.1 KB
/
clsRToken.vb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
Imports System.Text.RegularExpressions
'''--------------------------------------------------------------------------------------------
''' <summary>
''' Represents a single R token: a lexeme (a string of characters taken from an R script)
''' plus the type of R element that the lexeme represents, and a list of child tokens.
''' <para>
''' The class also provides shared helper functions that classify lexemes (e.g. as a
''' syntactic name, operator, bracket, comment etc.).
''' </para>
''' </summary>
'''--------------------------------------------------------------------------------------------
Public Class clsRToken

    ''' <summary> The different types of R element (function name, key word, comment etc.) 
    '''           that the token may represent. </summary>
    Public Enum typToken
        RSyntacticName
        RFunctionName
        RKeyWord
        RConstantString
        RComment
        RSpace
        RBracket
        RSeparator
        REndStatement
        REndScript
        RNewLine
        ROperatorUnaryLeft
        ROperatorUnaryRight
        ROperatorBinary
        ROperatorBracket
        RPresentation
        RInvalid
    End Enum

    ''' <summary> The lexeme associated with the token. </summary>
    Public strTxt As String

    ''' <summary> The token type (function name, key word, comment etc.).  </summary>
    Public enuToken As typToken

    ''' <summary> The token's children. </summary>
    Public lstTokens As New List(Of clsRToken)

    'Lookup tables used by the shared 'Is...' functions. They are created once per class
    '    (rather than once per call) and are never modified.
    Private Shared ReadOnly arrROperators() As String = {"::", ":::", "$", "@", "^", ":", "%%", "%/%",
        "%*%", "%o%", "%x%", "%in%", "/", "*", "+", "-", "<", ">", "<=", ">=", "==", "!=", "!", "&",
        "&&", "|", "||", "~", "->", "->>", "<-", "<<-", "="}
    Private Shared ReadOnly arrROperatorBrackets() As String = {"[", "]", "[[", "]]"}
    Private Shared ReadOnly arrROperatorUnary() As String = {"+", "-", "!", "~"}
    Private Shared ReadOnly arrRBrackets() As String = {"(", ")", "{", "}"}
    Private Shared ReadOnly arrRNewLines() As String = {vbCr, vbLf, vbCrLf}
    Private Shared ReadOnly arrKeyWords() As String = {"if", "else", "repeat", "while", "function",
        "for", "in", "next", "break"}

    '''--------------------------------------------------------------------------------------------
    ''' <summary>
    '''     Constructs a new token with lexeme <paramref name="strTxtNew"/> and token type 
    '''     <paramref name="enuTokenNew"/>.
    '''     <para>
    '''     A token is a string of characters that represent a valid R element, plus meta data about
    '''     the token type (identifier, operator, keyword, bracket etc.).
    '''     </para>
    ''' </summary>
    '''
    ''' <param name="strTxtNew">   The lexeme to associate with the token. </param>
    ''' <param name="enuTokenNew"> The token type (function name, key word, comment etc.). </param>
    '''--------------------------------------------------------------------------------------------
    Public Sub New(strTxtNew As String, enuTokenNew As typToken)
        strTxt = strTxtNew
        enuToken = enuTokenNew
    End Sub

    '''--------------------------------------------------------------------------------------------
    ''' <summary>
    '''     Constructs a token from <paramref name="strLexemeCurrent"/>. 
    '''     <para>
    '''     A token is a string of characters that represent a valid R element, plus meta data about
    '''     the token type (identifier, operator, keyword, bracket etc.).
    '''     </para><para>
    '''     <paramref name="strLexemePrev"/> and <paramref name="strLexemeNext"/> are needed
    '''     to correctly identify if <paramref name="strLexemeCurrent"/> is a unary or binary
    '''     operator.
    '''     </para><para>
    '''     If <paramref name="strLexemeCurrent"/> is null or empty then nothing is assigned: the
    '''     lexeme stays unset and the token type keeps the enumeration's default (first) value.
    '''     </para>
    ''' </summary>
    '''
    ''' <param name="strLexemePrev">          The non-space lexeme immediately to the left of
    '''                                       <paramref name="strLexemeCurrent"/>. </param>
    ''' <param name="strLexemeCurrent">       The lexeme to convert to a token. </param>
    ''' <param name="strLexemeNext">          The non-space lexeme immediately to the right of
    '''                                       <paramref name="strLexemeCurrent"/>. </param>
    ''' <param name="bLexemeNextOnSameLine">  True if <paramref name="strLexemeNext"/> is on the 
    '''                                       same line as <paramref name="strLexemeCurrent"/>.
    '''                                       Used to decide whether a syntactic name followed by 
    '''                                       '(' is a function name, and whether '~' has a right
    '''                                       hand operand. </param>
    '''--------------------------------------------------------------------------------------------
    Public Sub New(strLexemePrev As String, strLexemeCurrent As String, strLexemeNext As String, bLexemeNextOnSameLine As Boolean)
        'TODO refactor so that strLexemePrev and strLexemeNext are booleans rather than strings?
        If String.IsNullOrEmpty(strLexemeCurrent) Then
            Exit Sub
        End If

        strTxt = strLexemeCurrent
        enuToken = GetTokenType(strLexemePrev, strLexemeCurrent, strLexemeNext, bLexemeNextOnSameLine)
    End Sub

    '''--------------------------------------------------------------------------------------------
    ''' <summary>
    '''     Returns the token type of the non-empty lexeme <paramref name="strLexemeCurrent"/>.
    '''     The checks are order dependent (e.g. key words must be tested before syntactic names,
    '''     and unary operators before binary operators).
    ''' </summary>
    '''
    ''' <param name="strLexemePrev">          The non-space lexeme to the left of the lexeme. </param>
    ''' <param name="strLexemeCurrent">       The lexeme to classify (must not be empty). </param>
    ''' <param name="strLexemeNext">          The non-space lexeme to the right of the lexeme. </param>
    ''' <param name="bLexemeNextOnSameLine">  True if the next lexeme is on the same line. </param>
    '''
    ''' <returns> The token type, or RInvalid if the lexeme is not recognised. </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function GetTokenType(strLexemePrev As String, strLexemeCurrent As String, strLexemeNext As String, bLexemeNextOnSameLine As Boolean) As typToken

        'reserved key word (e.g. if, else etc.)
        If IsKeyWord(strLexemeCurrent) Then
            Return typToken.RKeyWord
        End If

        'function name (syntactic name immediately followed by '(' on the same line), 
        '    or plain syntactic name
        If IsSyntacticName(strLexemeCurrent) Then
            If strLexemeNext = "(" AndAlso bLexemeNextOnSameLine Then
                Return typToken.RFunctionName
            End If
            Return typToken.RSyntacticName
        End If

        'comment (starts with '#')
        If IsComment(strLexemeCurrent) Then
            Return typToken.RComment
        End If

        'string literal (starts with single or double quote)
        If IsConstantString(strLexemeCurrent) Then
            Return typToken.RConstantString
        End If

        'new line (e.g. '\n')
        If IsNewLine(strLexemeCurrent) Then
            Return typToken.RNewLine
        End If

        'end statement
        If strLexemeCurrent = ";" Then
            Return typToken.REndStatement
        End If

        'parameter separator
        If strLexemeCurrent = "," Then
            Return typToken.RSeparator
        End If

        'sequence of spaces
        If IsSequenceOfSpaces(strLexemeCurrent) Then
            Return typToken.RSpace
        End If

        'bracket (e.g. '{'); note that a closing curly bracket is typed as 'REndScript'
        If IsBracket(strLexemeCurrent) Then
            If strLexemeCurrent = "}" Then
                Return typToken.REndScript
            End If
            Return typToken.RBracket
        End If

        'bracket operator (e.g. '[')
        If IsOperatorBrackets(strLexemeCurrent) Then
            Return typToken.ROperatorBracket
        End If

        'unary right operator (e.g. '!x'): the operator has no operand on its left.
        '    An operand ends with a name/number character, a closing bracket, or the closing 
        '    quote of a string constant or backtick-quoted name (e.g. `my col` - 1).
        Dim bPrevIsOperand As Boolean = Not String.IsNullOrEmpty(strLexemePrev) AndAlso
                                        Regex.IsMatch(strLexemePrev, "[a-zA-Z0-9_\.)\]`'""]\z")
        If IsOperatorUnary(strLexemeCurrent) AndAlso Not bPrevIsOperand Then
            Return typToken.ROperatorUnaryRight
        End If

        'unary left operator (e.g. x~): the tilde has no operand on its right.
        '    An operand starts with a name/number character, an opening round bracket, a unary 
        '    operator, or the opening quote of a string constant or backtick-quoted name.
        If strLexemeCurrent = "~" Then
            Dim bNextIsOperand As Boolean = Not String.IsNullOrEmpty(strLexemeNext) AndAlso
                                            bLexemeNextOnSameLine AndAlso
                                            Regex.IsMatch(strLexemeNext, "^[a-zA-Z0-9_\.(\+\-\!~`'""]")
            If Not bNextIsOperand Then
                Return typToken.ROperatorUnaryLeft
            End If
        End If

        'binary operator (e.g. '+'), including complete user-defined operators (e.g. '%foo%')
        If IsOperatorReserved(strLexemeCurrent) OrElse
           Regex.IsMatch(strLexemeCurrent, "^%.*%\z") Then
            Return typToken.ROperatorBinary
        End If

        Return typToken.RInvalid
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Creates and returns a deep clone of this object (the child tokens are also 
    '''             cloned). </summary>
    '''
    ''' <exception cref="Exception">    Thrown when the object has an empty child token. </exception>
    '''
    ''' <returns>   A clone of this object. </returns>
    '''--------------------------------------------------------------------------------------------
    Public Function CloneMe() As clsRToken
        Dim clsToken = New clsRToken(strTxt, enuToken)

        For Each clsTokenChild As clsRToken In lstTokens
            If IsNothing(clsTokenChild) Then
                Throw New Exception("Token has illegal empty child.")
            End If
            clsToken.lstTokens.Add(clsTokenChild.CloneMe())
        Next

        Return clsToken
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a valid (complete or partial) 
    '''             lexeme, else returns false.
    '''             <para>
    '''             Quoted text (single quote, double quote or backtick) is treated as complete 
    '''             when its closing quote is reached; quote characters that are escaped with a
    '''             backslash do not close the quoted text.
    '''             </para></summary>
    '''
    ''' <param name="strTxt">   A sequence of characters from a syntactically correct R script </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a valid lexeme, else false. </returns>
    '''--------------------------------------------------------------------------------------------
    Public Shared Function IsValidLexeme(strTxt As String) As Boolean
        If String.IsNullOrEmpty(strTxt) Then
            Return False
        End If

        'if string is not a valid lexeme ...
        '    (it's handy to do the newline tests first because it simplifies the regular 
        '    expressions below)
        Dim bEndsInNewLine As Boolean = strTxt <> vbCrLf AndAlso Regex.IsMatch(strTxt, ".+\n$")
        If bEndsInNewLine OrElse                                         'string is >1 char and ends in newline
           Regex.IsMatch(strTxt, ".+\r$") OrElse                         'string is >1 char and ends in carriage return
           Regex.IsMatch(strTxt, "^%.*%.+") OrElse                       'user-defined operator followed by another character
           Regex.IsMatch(strTxt, "^'(?:[^'\\]|\\.)*'.+") OrElse          'single quoted string followed by another character
           Regex.IsMatch(strTxt, "^""(?:[^""\\]|\\.)*"".+") OrElse       'double quoted string followed by another character
           Regex.IsMatch(strTxt, "^`(?:[^`\\]|\\.)*`.+") Then            'backtick quoted string followed by another character
            Return False
        End If

        'if string is a valid lexeme ...
        If IsSyntacticName(strTxt) OrElse                'syntactic name or reserved word
           IsOperatorReserved(strTxt) OrElse             'operator (e.g. '+')
           IsOperatorBrackets(strTxt) OrElse             'bracket operator (e.g. '[')
           strTxt = "<<" OrElse                          'partial operator (the start of '<<-')
           IsNewLine(strTxt) OrElse                      'newlines (e.g. '\n')
           strTxt = "," OrElse strTxt = ";" OrElse       'parameter separator or end statement
           IsBracket(strTxt) OrElse                      'bracket (e.g. '{')
           IsSequenceOfSpaces(strTxt) OrElse             'sequence of spaces
           IsConstantString(strTxt) OrElse               'string constant (starts with single or double quote)
           IsOperatorUserDefined(strTxt) OrElse          'user-defined operator (starts with '%')
           IsComment(strTxt) Then                        'comment (starts with '#')
            Return True
        End If

        'if the string is not covered by any of the checks above, 
        '    then we assume by default, that it's not a valid lexeme
        Return False
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a complete or partial 
    '''             valid R syntactic name or key word, else returns false.<para>
    '''             Please note that the rules for syntactic names are actually stricter than 
    '''             the rules used in this function, but this library assumes it is parsing valid 
    '''             R code. </para><para>
    '''             A name is accepted if it consists only of letters, digits, underscores and 
    '''             dots, or if it starts with a backtick. </para></summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a valid R syntactic name or key word, 
    '''             else returns false.</returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsSyntacticName(strTxt As String) As Boolean
        If String.IsNullOrEmpty(strTxt) Then
            Return False
        End If

        'Note: '\z' (rather than '$') is used so that text ending in a newline is not accepted
        Return Regex.IsMatch(strTxt, "^[a-zA-Z0-9_\.]+\z") OrElse
               strTxt.StartsWith("`", StringComparison.Ordinal)
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a complete or partial string 
    '''             constant, else returns false.<para>
    '''             String constants are delimited by a pair of single (‘'’) or double (‘"’) quotes 
    '''             and can contain all other printable characters. Quotes and other special 
    '''             characters within strings are specified using escape sequences. </para><para>
    '''             Only the first character is checked (it must be a single or double quote).
    '''             </para></summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a complete or partial string constant,
    '''             else returns false.</returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsConstantString(strTxt As String) As Boolean
        If String.IsNullOrEmpty(strTxt) Then
            Return False
        End If

        Dim chrFirst As Char = strTxt(0)
        Return chrFirst = """"c OrElse chrFirst = "'"c
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a comment, else returns false.
    '''             <para>
    '''             Any text from a # character to the end of the line is taken to be a comment,
    '''             unless the # character is inside a quoted string. </para><para>
    '''             Only the first character is checked (it must be '#'). </para></summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a comment, else returns false.</returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsComment(strTxt As String) As Boolean
        Return Not String.IsNullOrEmpty(strTxt) AndAlso strTxt(0) = "#"c
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is sequence of spaces (and no other 
    '''             characters), else returns false. </summary>
    '''
    ''' <param name="strTxt">   The text to check . </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a sequence of one or more spaces (and no 
    '''             other characters), else returns false. </returns>
    '''--------------------------------------------------------------------------------------------
    Public Shared Function IsSequenceOfSpaces(strTxt As String) As Boolean 'TODO make private?
        'Note: '\z' only matches at the very end of the text. In contrast, '$' also matches 
        '    just before a final linefeed, which would wrongly accept a linefeed, or spaces 
        '    followed by a linefeed, as a sequence of spaces.
        Return Not String.IsNullOrEmpty(strTxt) AndAlso Regex.IsMatch(strTxt, "^ +\z")
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a functional R element 
    '''             (i.e. not empty, and not a space, comment or new line), else returns false. </summary>
    '''
    ''' <param name="strTxt">   The text to check . </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a functional R element
    '''             (i.e. not a space, comment or new line), else returns false. </returns>
    '''--------------------------------------------------------------------------------------------
    Public Shared Function IsElement(strTxt As String) As Boolean 'TODO make private?
        If String.IsNullOrEmpty(strTxt) Then
            Return False
        End If

        Return Not (IsNewLine(strTxt) OrElse
                    IsSequenceOfSpaces(strTxt) OrElse
                    IsComment(strTxt))
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a complete or partial  
    '''             user-defined operator, else returns false.<para>
    '''             Only the first character is checked (it must be '%'). </para></summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a complete or partial  
    '''             user-defined operator, else returns false.</returns>
    '''--------------------------------------------------------------------------------------------
    Public Shared Function IsOperatorUserDefined(strTxt As String) As Boolean 'TODO make private?
        Return Not String.IsNullOrEmpty(strTxt) AndAlso strTxt(0) = "%"c
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a reserved operator, else returns 
    '''             false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a reserved operator, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Public Shared Function IsOperatorReserved(strTxt As String) As Boolean 'TODO make private?
        Return Array.IndexOf(arrROperators, strTxt) >= 0
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a bracket operator 
    '''             ('[', ']', '[[' or ']]'), else returns false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a bracket operator, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsOperatorBrackets(strTxt As String) As Boolean
        Return Array.IndexOf(arrROperatorBrackets, strTxt) >= 0
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a unary operator 
    '''             ('+', '-', '!' or '~'), else returns false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a unary operator, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsOperatorUnary(strTxt As String) As Boolean
        Return Array.IndexOf(arrROperatorUnary, strTxt) >= 0
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a bracket 
    '''             ('(', ')', '{' or '}'), else returns false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a bracket, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsBracket(strTxt As String) As Boolean
        Return Array.IndexOf(arrRBrackets, strTxt) >= 0
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a new line (carriage return, 
    '''             linefeed, or carriage return followed by linefeed), else returns false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a new line, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsNewLine(strTxt As String) As Boolean
        Return Array.IndexOf(arrRNewLines, strTxt) >= 0
    End Function

    '''--------------------------------------------------------------------------------------------
    ''' <summary>   Returns true if <paramref name="strTxt"/> is a key word 
    '''             (e.g. 'if', 'else', 'function'), else returns false.</summary>
    '''
    ''' <param name="strTxt">   The text to check. </param>
    '''
    ''' <returns>   True if <paramref name="strTxt"/> is a key word, else returns false.
    '''             </returns>
    '''--------------------------------------------------------------------------------------------
    Private Shared Function IsKeyWord(strTxt As String) As Boolean
        Return Array.IndexOf(arrKeyWords, strTxt) >= 0
    End Function

End Class