# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import hashlib
import io
import json
import os
import re
from collections import Counter

import requests

from logspam import WARNING_RE


class CacheFileNotFoundException(Exception):
    """Raised when no cached results exist or the cache file was corrupt."""
    pass


def normalize_line(line):
    """
    Normalizes the given line to make comparisons easier. This strips:
      - timestamps
      - pids
      - file path prefixes
      - values that look like pointer addresses
    """
    line = line.decode('utf-8')

    try:
        # Raw logs are now encoded in json.
        json_line = json.loads(line)
        line = json_line['data']
    except (ValueError, KeyError, TypeError):
        # Legacy logs may be plain text, particularly if live_backing.log is
        # specified.
        pass

    # taskcluster prefixing, ie:
    #   [task 2016-09-20T11:09:35.539828Z] 11:09:35
    line = re.sub(r'^\[task[^\]]+\]\s', '', line)
    # Log-level prefixes, ie:
    #   11:09:35 INFO - ...   or   1234 INFO ...
    line = re.sub(r'^[0-9:]+\s+INFO\s+-\s+', '', line)
    line = re.sub(r'^[0-9]+\s+INFO\s+', '', line)
    line = re.sub(r'^PROCESS \| [0-9]+ \| ', '', line)
    # PID 13497 | WARNING:
    line = re.sub(r'^PID\s+[0-9]+\s+\|\s+', '', line)
    # Process annotations, ie: [Child 1234]
    line = re.sub(r'\[(Child|Parent|GMP|NPAPI)?\s?[0-9]+\]', '', line)
    line = re.sub(r'/home/worker/workspace/build/src/', '', line)
    line = re.sub(r'/builds/worker/checkouts/gecko/', '', line)
    # Attempt buildbot paths, ie:
    #   c:/builds/moz2_slave/m-cen-w32-d-000000000000000000/build/src/
    line = re.sub(r'([a-z]:)?/builds/[^/]+/[^/]+/build/src/', '', line)
    # z:/build/build/src/
    line = re.sub(r'([a-z]:)?/(build/)+src/', '', line)
    # Values that look like pointer addresses, ie: blah=1caa2c00
    line = re.sub(r'=[a-z0-9]+', '=NNNNNN', line)
    # GECKO(1265) |
    line = re.sub(r'GECKO\([0-9]+\) \|', '', line)
    line = line.strip()
    # Leading thread annotations, ie: [1355, Main Thread]
    line = re.sub(r'^\[[^\]]+\]\s+', '', line)
    return line
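
# A worked example (illustrative, not from the repo): given the raw bytes
#   b'[task 2016-09-20T11:09:35.539828Z] 11:09:35 INFO - WARNING: unable to init blah=1caa2c00'
# normalize_line() returns
#   'WARNING: unable to init blah=NNNNNN'
# so the same warning from different runs collapses to a single Counter key.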


class CustomEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that handles ParsedLog objects.
    """
    def default(self, obj):
        if isinstance(obj, ParsedLog):
            return obj.to_json()
        return json.JSONEncoder.default(self, obj)
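
# Usage sketch (assumes `logs` is a list of ParsedLog objects):
#   json.dumps(logs, cls=CustomEncoder)
# This is what Cache.store_results() below relies on.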


class ParsedLog:
    """
    Represents a log file that was downloaded and processed.
    """
    def __init__(self, url, job_name, file_name=None):
        self.url = url
        self.job_name = job_name
        if not file_name:
            self.fname = os.path.basename(job_name.replace(' ', '_') + '.log')
        else:
            self.fname = file_name

        self.warnings = Counter()

    def _download_file(self, dest, warning_re):
        r = requests.get(self.url, stream=True)
        # Write with an explicit encoding so the read path in download()
        # below can round-trip the normalized log.
        with io.open(dest, 'w', encoding='utf-8') as f:
            for x in r.iter_lines():
                if x:
                    line = normalize_line(x)
                    self.add_warning(line, warning_re)
                    f.write(line + '\n')

    def download(self, cache_dir, warning_re):
        """
        Downloads the log file and normalizes it. Warnings are also
        accumulated.
        """
        success = True

        # Check if we can bypass downloading first.
        dest = os.path.join(cache_dir, self.fname)
        if os.path.exists(dest):
            # Use |io.open| to handle unicode weirdness.
            with io.open(dest, 'r', encoding='utf-8') as f:
                for x in f:
                    self.add_warning(x.rstrip(), warning_re)
        else:
            success = False
            # Retry up to five times on connection errors.
            for _ in range(5):
                try:
                    self._download_file(dest, warning_re)
                    success = True
                    break
                except requests.exceptions.ConnectionError:
                    # TODO(ER): Maybe nuke dest?
                    # Drop warnings accumulated by the failed attempt.
                    self.warnings.clear()

        return success

    def add_warning(self, line, match_re=WARNING_RE):
        """
        Adds the line to the set of warnings if it contains a warning.
        """
        if re.search(match_re, line):
            self.warnings[line] += 1

    def to_json(self):
        """
        Creates a json representation of this object.
        """
        return {
            'url': self.url,
            'job_name': self.job_name,
            'fname': self.fname,
            'warnings': dict(self.warnings)
        }
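
# Note: Cache.read_results() below reconstructs ParsedLog instances from
# exactly these keys, so to_json() and read_results() must stay in sync.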


def cache_file_path(cache_dir, warning_re):
    """
    Generates the cache file name.
    """
    if warning_re != WARNING_RE:
        # md5 operates on bytes, so encode the regex first.
        warning_md5 = hashlib.md5(warning_re.encode('utf-8')).hexdigest()
        return os.path.join(cache_dir, "results.%s.json" % warning_md5)
    else:
        return os.path.join(cache_dir, "results.json")
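
# For example: the default WARNING_RE maps to <cache_dir>/results.json,
# while a custom regex maps to <cache_dir>/results.<md5(regex)>.json, so
# results cached for different regexes never collide.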


class Cache(object):
    def __init__(self, cache_dir, warning_re):
        self.path = cache_file_path(cache_dir, warning_re)

    def store_results(self, parsed_logs):
        """
        Caches the parsed results in a json file.
        """
        with open(self.path, 'w') as f:
            json.dump(parsed_logs, f, cls=CustomEncoder)

    def read_results(self):
        """
        Reads the cached results from a previous run.
        """
        if not os.path.isfile(self.path):
            raise CacheFileNotFoundException(
                "Cache file %s not found" % self.path)

        print("Reading cache from %s" % self.path)
        parsed_logs = []
        with open(self.path, 'r') as f:
            try:
                raw_list = json.load(f)
            except Exception:
                # Corrupt json file, nuke it.
                os.remove(self.path)
                raise CacheFileNotFoundException(
                    "Cache file %s was corrupt" % self.path)

            for x in raw_list:
                if not x:
                    continue
                log = ParsedLog(x['url'], x['job_name'], x['fname'])
                log.warnings.update(x['warnings'])
                parsed_logs.append(log)

        return parsed_logs
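

# Minimal usage sketch (illustrative; the URL and cache directory are
# hypothetical, and WARNING_RE comes from the logspam package):
#
#   cache = Cache('/tmp/logspam-cache', WARNING_RE)
#   log = ParsedLog('https://example.com/live_backing.log', 'test-job')
#   if log.download('/tmp/logspam-cache', WARNING_RE):
#       cache.store_results([log])
#   logs = cache.read_results()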