-
Notifications
You must be signed in to change notification settings - Fork 1
/
regex.py
148 lines (126 loc) · 4.94 KB
/
regex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import string
CHARSET = string.digits + string.ascii_letters
EMPTY_STRING = 0
SYMBOL_SIMPLE = 1
SYMBOL_ANY = 2
SYMBOL_SET = 3
MAYBE = 4
STAR = 5
PLUS = 6
RANGE = 7
CONCATENATION = 8
ALTERNATION = 9
_SIMPLE_TYPES = {EMPTY_STRING, SYMBOL_SIMPLE, SYMBOL_ANY, SYMBOL_SET}
_BINARY_TYPES = {CONCATENATION, ALTERNATION}
def str_paranthesize(parent_type, re):
if parent_type > re.type or parent_type == re.type and parent_type != STAR:
return str(re)
else:
return "({!s})".format(str(re))
class RegEx(object):
"""Model a RegEx TDA
The member "type" is always available, indicating the type of the
RegularExpression. Its value dictates which other members (if any) are
defined:
- EMPTY_STRING:
- SYMBOL_SIMPLE: "symbol" is the symbol
- SYMBOL_ANY:
- SYMBOL_SET: "symbol_set" is a set of elements which are either
regular symbols or tuples of symbols representing a symbol range
- MAYBE: "lhs" is the RegEx
- STAR: "lhs" is the RegEx
- PLUS: "lhs" is the RegEx
- RANGE: "lhs" is the RegEx and "range" is a tuple representing the
range; an unspecified extremity is denoted with a value of -1
- CONCATENATION: "lhs" and "rhs" are the RegExes
- ALTERNATION: "lhs" and "rhs" are the RegExes
"""
def __init__(self, type, obj1=None, obj2=None):
"""Create a RegEx
The value of the "type" parameter influences the interpretation of the
other two paramters:
- EMPTY_STRING: obj1 and obj2 are unused
- SYMBOL_SIMPLE: obj1 should be a symbol; obj2 is unused
- SYMBOL_ANY: obj1 and obj2 are unused
- SYMBOL_SET: obj1 is a set of either symbols or tuples signifying
a symbol range; obj2 is unused
- MAYBE: obj1 should be a RegEx; obj2 is unused
- STAR: obj1 should be a RegEx; obj2 is unused
- PLUS: obj1 should be a RegEx; obj2 is unused
- RANGE: obj1 should be a RegEx; obj2 should be a tuple
representing the range (an unspecified extremity is represented
with a value of -1)
- CONCATENATION: obj1 and obj2 should be RegEx
- ALTERNATION: obj1 and obj2 should be RegEx
"""
self.type = type
if type in _SIMPLE_TYPES:
if type == SYMBOL_SIMPLE:
assert obj1 in CHARSET
self.symbol = obj1
elif type == SYMBOL_SET:
assert obj1 is not None
self.symbol_set = obj1
else:
assert isinstance(obj1, RegEx)
self.lhs = obj1
if type == RANGE:
assert obj2 is not None
x, y = obj2
assert (y > 0 and x <= y) or (x >= 0)
self.range = obj2
if type in _BINARY_TYPES:
assert obj2 is not None
assert isinstance(obj2, RegEx)
self.rhs = obj2
def __str__(self):
def normalize_to_tuple(e):
"""Allows us to sort sets containing both symbols and ranges
This is needed in order to deterministically represent a symbol set
as a string; namely all single characters first (in alphabetical
order), followed by all ranges.
"""
if isinstance(e, str):
return (e, "")
return e
if self.type == EMPTY_STRING:
return ""
if self.type == SYMBOL_SIMPLE:
return str(self.symbol)
if self.type == SYMBOL_ANY:
return "."
if self.type == SYMBOL_SET:
result = "["
for range in sorted(self.symbol_set, key=normalize_to_tuple):
result += "-".join(range)
return result + "]"
if self.type == MAYBE:
slhs = str_paranthesize(self.type, self.lhs)
return slhs + "?"
if self.type == STAR:
slhs = str_paranthesize(self.type, self.lhs)
return slhs + "*"
if self.type == PLUS:
slhs = str_paranthesize(self.type, self.lhs)
return slhs + "+"
if self.type == RANGE:
slhs = str_paranthesize(self.type, self.lhs)
x, y = self.range
if x == -1:
aux = "{{,{}}}".format(y)
elif y == -1:
aux = "{{{},}}".format(x)
elif x == y:
aux = "{{{}}}".format(x)
else:
aux = "{{{},{}}}".format(x, y)
return slhs + aux
if self.type == CONCATENATION:
slhs = str_paranthesize(self.type, self.lhs)
srhs = str_paranthesize(self.type, self.rhs)
return slhs + srhs
if self.type == ALTERNATION:
slhs = str_paranthesize(self.type, self.lhs)
srhs = str_paranthesize(self.type, self.rhs)
return slhs + "|" + srhs
raise Exception("Unknown type!")