-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsers.py
137 lines (113 loc) · 4.97 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
from tree_sitter import Language, Parser
'''
Backslashes in ts_patterns should be escaped twice.
Backslashes in re_patterns should be escaped once.
Example:
"patterns": [
{
"ts_pattern": [
"?!config\\\\.Config",
"\\\\bConfig\\\\b"
],
"ts_type": "type",
"re_pattern": "\\b(?<!config\\.)Config\\b"
}
]
'''
def format_pattern_string(pattern_s, change, namespaces):
if change.get('namespace') != None:
if namespaces.get(change['namespace']) != None:
pattern_s = pattern_s.replace('{{namespace}}', namespaces.get(change['namespace']))
else:
pattern_s = pattern_s.replace('{{namespace}}', change['namespace'])
if '{namespace}}' in pattern_s:
raise ValueError(f'Pattern regex `{pattern_s}` contains `{{{{namespace}}}}` but namespace not given in json entry')
return pattern_s
class TreeSitterParser:
def __init__(self, language_name):
if language_name == 'python':
import tree_sitter_python as tslang
from languagequeries.python import PythonQueries
self.queries = PythonQueries()
elif language_name == 'go':
import tree_sitter_go as tslang
from languagequeries.go import GoQueries
self.queries = GoQueries()
elif language_name == 'javascript':
import tree_sitter_javascript as tslang
from languagequeries.javascript import JSQueries
self.queries = JSQueries()
else:
# click should prevent ever getting here
raise ValueError('Invalid language. Valid choices are: python, go, javascript.')
self.language = Language(tslang.language())
def set_source(self, source):
self.source = source
self.ast = Parser(self.language).parse(bytes(self.source.text, 'utf8'))
self.namespaces = self.build_namespaces_dict()
def get_captures_for_pattern(self, pattern, change):
if pattern.get('ts_pattern') == None:
return []
if isinstance(pattern['ts_pattern'], str):
pattern_formatted = format_pattern_string(pattern['ts_pattern'], change, self.namespaces)
query = getattr(self.queries, pattern['ts_type'])(pattern_formatted)
elif isinstance(pattern['ts_pattern'], list):
pattern_formatted = [format_pattern_string(p, change, self.namespaces) for p in pattern['ts_pattern']]
query = getattr(self.queries, pattern['ts_type'])(*pattern_formatted)
else:
raise ValueError('Change identifier must be str or list.')
matches = self.language.query(query)
if pattern.get('ts_uniqueify') != None and pattern.get('ts_uniqueify') == 'True':
captures = self.uniqueify_captures(matches.captures(self.ast.root_node), pattern)
else:
captures = matches.captures(self.ast.root_node)
return [(c[0].range.start_point, c[0].range.end_point) for c in captures] # only extract relevant bits
def uniqueify_captures(self, captures, pattern):
'''
Pattern change entries consisting of a list yield as many matches as
list elements, from tree-sitter. (Normally?) Only the last match is relevant.
'''
if isinstance(pattern['ts_pattern'], str):
step_size = 1
elif isinstance(pattern['ts_pattern'], list):
step_size = len(pattern['ts_pattern'])
return captures[step_size-1::step_size]
def build_namespaces_dict(self):
"""
Allows library to work with apps that have aliased their imports,
for example in Go:
import n "github.com/neo4j/neo4j-go-driver/v4/neo4j"
import ll "github.com/neo4j/neo4j-go-driver/v5/neo4j/log"
The changelog json specifies the standard namespace to which each changelog
entry is about, and this function infers the actual namespace names from
the user's codebase so that the defaults may be overwritten.
Dict has form {default_name : alias}.
"""
namespaces_d = {}
ast = self.ast
query = self.queries._import_for_namespace()
matches = self.language.query(query).matches(ast.root_node)
for m in matches:
if m[1] == {}:
continue
pkg_name, imported_as = self.queries._parse_import_alias(m, self.source.lines)
namespaces_d[pkg_name] = imported_as
return namespaces_d
class RegexParser:
def set_source(self, source):
self.source = source
def get_captures_for_pattern(self, pattern, change):
captures = []
if pattern.get('re_pattern') == None:
return captures
for i in range(len(self.source.lines)):
match = re.search(pattern['re_pattern'], self.source.lines[i])
if match != None:
captures.append(
(
(i, match.start()),
(i, match.end())
)
)
return captures