From 0205c386f98b0c1e1bc847986ea2c797c025425a Mon Sep 17 00:00:00 2001 From: Justin Bassett Date: Wed, 4 Aug 2021 16:40:23 -0700 Subject: [PATCH] Fix import utils --- util/analyze/imports/import_cpu2006.py | 5 +- util/analyze/imports/import_utils.py | 145 ++++++++++++++----------- 2 files changed, 82 insertions(+), 68 deletions(-) diff --git a/util/analyze/imports/import_cpu2006.py b/util/analyze/imports/import_cpu2006.py index c8b18363a..23edd8a32 100755 --- a/util/analyze/imports/import_cpu2006.py +++ b/util/analyze/imports/import_cpu2006.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import os -import re from . import import_utils @@ -13,8 +12,8 @@ def parse(file): with open(file, 'r') as f: return import_utils.parse_multi_bench_file( f.read(), - benchstart=re.compile(r'Building (?P<name>\S*)'), - filename=re.compile(r'/[fc]lang\b.*\s(\S+\.\S+)\n')) + benchstart=r'Building (?P<name>\S*)', + filename=r'/[fc]lang\b.*\s(\S+\.\S+)\n') if __name__ == '__main__': diff --git a/util/analyze/imports/import_utils.py b/util/analyze/imports/import_utils.py index 3454cb673..5b5baa224 100644 --- a/util/analyze/imports/import_utils.py +++ b/util/analyze/imports/import_utils.py @@ -1,13 +1,15 @@ -import pickle -import json import itertools +import json +import pickle import re import sys -from collections import namedtuple +from dataclasses import dataclass +from typing import List, Match, Optional, Pattern, Union -from .._types import Logs, Benchmark, Block +from .._types import Benchmark, Block, Logs -_RE_REGION_INFO = re.compile(r'EVENT:.*ProcessDag.*"name": "(?P<name>[^"]*)"') +_REGION_DELIMITER = 'INFO: ********** Opt Scheduling **********' +_RE_REGION_DELIMITER = re.compile(re.escape(_REGION_DELIMITER)) def import_main(parsefn, *, description): @@ -24,18 +26,39 @@ def import_main(parsefn, *, description): pickle.dump(result, f) -def parse_multi_bench_file(logtext, *, benchstart, filename=None): +def parse_multi_bench_file(logtext: str, *, benchstart: Union[Pattern, str], filename: 
Optional[Union[Pattern, str]] = None): + if filename is not None: + filename = re.compile(filename) + benchstart = re.compile(benchstart) + + def parse_bench(benchm: Match, nextm: Union[Match, _DummyEnd], is_first: bool = False): + # The RE can specify any extra properties. + info = benchm.groupdict() + # If this is the first benchmark in the file, we want to start from the + # start of the file so that we don't lose any information. + start = 0 if is_first else benchm.start() + end = nextm.start() + return _parse_benchmark(info, logtext, + start, end, + filenamere=filename) + + bench_matches = list(benchstart.finditer(logtext)) benchmarks = [] - for benchm, nextm in _splititer(benchstart, logtext): - bench = _parse_benchmark(benchm.groupdict(), logtext, - benchm.end(), nextm.start(), - filenamere=filename) - benchmarks.append(bench) + + is_first: bool = True + for benchm, nextm in zip( + bench_matches, + [*bench_matches[1:], _DummyEnd(len(logtext))] + ): + benchmarks.append(parse_bench(benchm, nextm, is_first)) + is_first = False return Logs(benchmarks) -def parse_single_bench_file(logtext, *, benchname, filename=None): +def parse_single_bench_file(logtext, *, benchname, filename: Optional[Union[Pattern, str]] = None): + if filename is not None: + filename = re.compile(filename) return Logs([ _parse_benchmark( {'name': benchname}, @@ -45,21 +68,10 @@ def parse_single_bench_file(logtext, *, benchname, filename=None): ]) -_FileInfo = namedtuple('_FileInfo', ('filename', 'from_pos')) - - -def _each_cons(iterable, n): - ''' - Iterates over each consecutive n items of the iterable. 
- - _each_cons((1, 2, 3, 4), 2) # (1, 2), (2, 3), (3, 4) - ''' - iters = [None] * n - iters[0] = iter(iterable) - for i in range(1, n): - iters[i - 1], iters[i] = itertools.tee(iters[i - 1]) - next(iters[i], None) - return zip(*iters) +@dataclass +class _FileInfo: + filename: Optional[str] + from_pos: int class _DummyEnd: @@ -73,58 +85,59 @@ def end(self): return self._end -def _splititer(regex, text, pos=0, endpos=None): - ''' - 'Splits' the string by the regular expression, using an iterable. - Returns both where the regex matches and where it matched next (or the end). - ''' - if endpos is None: - endpos = len(text) - 1 +def _filename_info(filenamere: Optional[Pattern], logtext: str, start: int, end: int) -> List[_FileInfo]: + if filenamere is None: + filenamere = re.compile(r'.^') # RE that doesn't match anything + files = [] - return _each_cons( - itertools.chain(regex.finditer(text, pos, endpos), - (_DummyEnd(endpos + 1),)), - 2 - ) + for filem in filenamere.finditer(logtext, start, end): + filename = filem.group(1) + filestart = filem.end() + files.append(_FileInfo(filename=filename, from_pos=filestart)) + return files -def _parse_benchmark(info, logtext: str, start, end, *, filenamere): - NAME = info['name'] + +def _parse_benchmark(info: dict, logtext: str, start: int, end: int, *, filenamere: Optional[Pattern]): + BENCHNAME = info['name'] blocks = [] - if filenamere and filenamere.search(logtext, start, end): - files = [ - *(_FileInfo(filename=r.group(1), from_pos=r.end()) - for r in filenamere.finditer(logtext, start, end)), - _FileInfo(filename=None, from_pos=len(logtext)), - ][::-1] - else: - files = [ - _FileInfo(filename=None, from_pos=start), - _FileInfo(filename=None, from_pos=len(logtext)), - ][::-1] + files: List[_FileInfo] = _filename_info(filenamere, logtext, start, end) + if not files: + # We have an unknown file starting from the very beginning + files = [_FileInfo(filename=None, from_pos=start)] + + # Allow us to peek ahead by giving a dummy 
"file" at the end which will never match a block + files.append(_FileInfo(filename=None, from_pos=end)) + assert len(files) >= 2 + file_pos = 0 + + block_matches1, block_matches2 = itertools.tee(_RE_REGION_DELIMITER.finditer(logtext, start, end)) + next(block_matches2) # Drop first + block_matches2 = itertools.chain(block_matches2, (_DummyEnd(end),)) blocks = [] - for regionm, nextm in _splititer(_RE_REGION_INFO, logtext, start, end): - assert regionm.end() > files[-1].from_pos - if regionm.end() > files[-2].from_pos: - files.pop() + is_first = True + for regionm, nextm in zip(block_matches1, block_matches2): + region_start = regionm.end() + if region_start > files[file_pos + 1].from_pos: + file_pos += 1 + + assert region_start > files[file_pos].from_pos - try: - filename = files[-1].filename - except NameError: - filename = None + filename = files[file_pos].filename if files[file_pos] else None regioninfo = { - 'name': regionm['name'], 'file': filename, - 'benchmark': NAME, + 'benchmark': BENCHNAME, } - block = _parse_block(regioninfo, logtext, - regionm.start() - 1, nextm.start()) - blocks.append(block) + blk_start = start if is_first else regionm.start() + blk_end = nextm.start() + blocks.append(_parse_block(regioninfo, logtext, + blk_start, blk_end)) + is_first = False return Benchmark(info, blocks) @@ -132,6 +145,8 @@ def _parse_benchmark(info, logtext: str, start, end, *, filenamere): def _parse_block(info, logtext: str, start, end): events = _parse_events(logtext, start, end) raw_log = logtext[start:end] + assert 'ProcessDag' in events + info['name'] = events['ProcessDag'][0]['name'] return Block(info, raw_log, events)