"""Helpers for turning the per-node 'context' dicts of a graph dump into
normalized token lists: identifier splitting, lowercasing and Porter stemming."""
import re

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer  # only used by the commented-out lemmatizer lines below

def flatten(container):
    # Recursively flatten arbitrarily nested lists/tuples into a flat stream of items.
    for i in container:
        if isinstance(i, (list, tuple)):
            yield from flatten(i)
        else:
            yield i
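
# Illustrative usage (made-up values):
#   list(flatten([['a', ['b']], 'c']))  ->  ['a', 'b', 'c']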

def extract_tokens(context_dict):
    # Gather the literal tokens recorded for a node: positional argument
    # values, keyword argument values and keyword argument names.
    ret_list = []
    for key in ('arg_val', 'keyword_val', 'keyword_key'):
        if key in context_dict:
            ret_list.extend(flatten(context_dict[key]))
    return ret_list
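
# Illustrative usage (made-up context dict):
#   extract_tokens({'arg_val': [['a', ['b']]], 'keyword_key': ['c']})  ->  ['a', 'b', 'c']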

def extract_types(context_dict):
    # Gather the type names recorded for positional and keyword arguments.
    ret_list = []
    for key in ('arg_type', 'keyword_type'):
        if key in context_dict:
            ret_list.extend(flatten(context_dict[key]))
    return ret_list

def process_obj_name(name):
    # Split an object name into words, lowercase each and reduce it to its Porter stem.
    stemmer = PorterStemmer()
    ret_list = []
    for split in split_tokens(name):
        ret_list.append(stemmer.stem(split.lower()))
    return ret_list

def camel_case_split(identifier):
    # Split camelCase/PascalCase identifiers, keeping runs of capitals
    # (acronyms) together as one word.
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]
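
# Illustrative usage:
#   camel_case_split('parseHTTPResponse')  ->  ['parse', 'HTTP', 'Response']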

def split_tokens(identifier):
    # Break an identifier into words: first on case boundaries, then on
    # underscores and any other non-word characters, dropping empty pieces.
    word_list = []
    for word in camel_case_split(identifier):
        word = word.replace('_', ' ')
        word_list.extend(w for w in re.split(r'\W+', word) if w)
    return word_list
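
# Illustrative usage:
#   split_tokens('submit_commandsHTTP')  ->  ['submit', 'commands', 'HTTP']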

def process_tokens(context_dict):
    # Normalize every literal token in a context dict: split identifiers into
    # words, lowercase them and apply Porter stemming.
    tokens = extract_tokens(context_dict)
    porter_stemmer = PorterStemmer()
    # wordnet_lemmatizer = WordNetLemmatizer()
    processed = []
    for token in tokens:
        if isinstance(token, str):  # only process strings
            for split in split_tokens(token):
                split = porter_stemmer.stem(split.lower())
                # split = wordnet_lemmatizer.lemmatize(split)
                processed.append(split)
    return processed
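
# Illustrative usage (hypothetical context dict; the stems come from PorterStemmer):
#   process_tokens({'arg_val': [['submitCommands']], 'keyword_key': ['denyText']})
#   ->  ['submit', 'command', 'deni', 'text']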

def process_context(context_dict, process_types=False, process_values=False):
    # Build one flat feature list for a node: stemmed value tokens and/or raw
    # type names, depending on the flags.
    processed_list = []
    if process_values:
        processed_list.extend(process_tokens(context_dict))
    if process_types:
        processed_list.extend(extract_types(context_dict))
    return processed_list

# Ad-hoc test code, kept commented out:
# print(process_obj_name('test_runn'))
'''
test = [['submitCommands', ['SendCommands']], ['cancelCommands',
        ['SendCommands']], ['denyCommands', ['SendCommands']],
        ['confirmCommands', ['SendCommands']], ['denyText', 'Keep it'],
        ['cancelLabel', 'Keep it'], ['submitLabel', 'Change it'],
        ['confirmText', 'Change it'], ['cancelTrigger', 'Confirm']]
print(list(flatten(test)))
'''
'''
import json

with open('graph-28594.txt', 'r') as file:
    json_chunks = file.read().strip().split('--------------------')
    for chunk in json_chunks:
        if chunk:
            json_data = json.loads(chunk)
            # pprint(json_data)
            for graph_file in json_data['files']:
                for node_num, node_value in graph_file['graph']['graph_dict'].items():
                    if 'context' in node_value:
                        # process_tokens expects the context dict itself,
                        # not a pre-extracted token list
                        processed_tokens = process_tokens(node_value['context'])
                        print(node_num, processed_tokens)
'''

# Layout of the graph dump consumed above:
#   two root keys: 'files' and 'folders'
#   each entry under 'files' has two keys: 'graph' and 'file'
#   under 'graph', three keys: 'count', 'graph_dict' and 'start_vertex'
#   'graph_dict' maps node_num -> node_value
#   each node_value is itself a dict and may contain a 'context' dict
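
# A minimal runnable sketch of that layout, using made-up node data (not from
# a real graph dump), showing how process_context consumes a node's context.
if __name__ == '__main__':
    graph_dump = {
        'files': [{
            'file': 'example.py',
            'graph': {
                'count': 1,
                'start_vertex': '0',
                'graph_dict': {
                    '0': {'context': {'arg_val': [['submitCommands']],
                                      'arg_type': [['str']]}},
                },
            },
        }],
        'folders': [],
    }
    for graph_file in graph_dump['files']:
        for node_num, node_value in graph_file['graph']['graph_dict'].items():
            if 'context' in node_value:
                # expected output: 0 ['submit', 'command', 'str']
                print(node_num, process_context(node_value['context'],
                                                process_types=True,
                                                process_values=True))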