-
Notifications
You must be signed in to change notification settings - Fork 1
/
p-stats2data.py
executable file
·138 lines (114 loc) · 5.03 KB
/
p-stats2data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 12 10:13:43 2017
@author: pemoser
"""
import sys
import re
import os
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Return a key that sorts strings with embedded numbers numerically.

    The string is split on runs of ASCII digits; each digit run becomes an
    ``int`` and every other piece is lower-cased, so that e.g. ``"9"`` sorts
    before ``"10"`` and text is compared case-insensitively.

    Args:
        s: The string to build a key for.
        _nsre: Splitting pattern (implementation detail). It is expected to
            contain exactly one capturing group matching a run of digits.

    Returns:
        A list alternating text pieces (``str``) and numbers (``int``),
        always starting with a (possibly empty) text piece.
    """
    # re.split() with one capturing group yields text, digits, text, ... so
    # the captured digit runs sit exactly at the odd indices. Deciding by
    # position instead of str.isdigit() avoids int() failing on characters
    # such as superscript digits, and keeps str/int positions aligned between
    # keys so they stay mutually comparable.
    return [int(piece) if index % 2 else piece.lower()
            for index, piece in enumerate(re.split(_nsre, s))]
def printErrorAndExit(msg):
    """Report a fatal error on stderr and terminate with exit status 1.

    The message is prefixed with the base name of the running script
    (taken from ``sys.argv[0]``) followed by ``": ERROR: "``.
    """
    script_name = os.path.basename(sys.argv[0])
    report = "".join((script_name, ": ERROR: ", msg))
    print(report, file=sys.stderr, flush=True)
    raise SystemExit(1)
def main():
    """Aggregate run times from experiment TSV files into one table on stdout.

    Command line: ``<prefix> file [file ...]``. Each file name (without
    directory and extension) must match
    ``<prefix><parameter-name><parameter-value>``, where the name consists of
    letters and the value of digits and dots. All files must share the same
    parameter name.

    From every parsable line the following tab-separated cells are read:
    cell 0 (algorithm; only its base name is kept), cell 1 (integer run time)
    and cell 8 -- or cell 7 as a fallback -- (result count). A line that
    cannot be parsed starts a new experiment run, and a newer run of an
    algorithm replaces its older one.

    The output is a TSV table: a header with the parameter name, one column
    per algorithm and ``RESULTS``; then one row per parameter value (in
    natural order) with the truncated average run time per algorithm
    (``nan`` if the algorithm has no data for that value) and the result
    count.

    Exits with status 0 after printing the usage text when fewer than two
    arguments are given, and with status 1 (via ``printErrorAndExit``) on
    any input error.
    """
    if len(sys.argv) <= 2:
        print("USAGE: pw_stats2data.py <prefix> list-of-files")
        print()
        print("   Extract from a list of files containing experiments (TSV)")
        print("   the execution time. Remove <prefix> from each filename, and choose the remaining")
        print("   text as varying parameter name.")
        print("   The full filename pattern looks like this: <prefix><parameter-name><parameter-value>")
        print()
        print("   The result is then a table with the prefix as first column name, and all algorithms as following")
        print("   column names. The cells contain the average of all execution times within a single file.")
        print()
        sys.exit(0)

    # Collect data
    # Results have the following hierarchy:
    #   parameterValue > algo > expRun > [runtime sum, line count, result count]
    # For example: parameterName = 'N' and the value is the cardinality, so we
    # have a table as follows:
    #   10     algoA    {0: [10, 2, 200]}
    #          algoB    {...}
    #   20     algoA    ...
    results = {}
    algorithms = []
    parameters = []
    parameterName = "X"
    oldParameterName = None
    prefix = sys.argv[1]

    for arg in sys.argv[2:]:
        if not os.path.exists(arg):
            printErrorAndExit("File '%s' does not exist." % arg)

        # Get the varying variable name and value from the filename.
        # NOTE: the prefix is inserted into the pattern unescaped, so regex
        # metacharacters inside it are interpreted as such.
        filename = os.path.splitext(os.path.basename(arg))[0]
        m = re.match(r"%s([a-zA-Z]+)([0-9\.]+).*" % prefix, filename)
        if not m:
            printErrorAndExit("Prefix '%s' does not match with filename '%s'. At least one letter must be left as parameter name." % (prefix, filename))
        parameterName = m.group(1)
        parameterValue = m.group(2)

        if oldParameterName is None:
            oldParameterName = parameterName
        elif parameterName != oldParameterName:
            printErrorAndExit("Parameter name mismatch. First it was '%s', then '%s'." % (oldParameterName, parameterName))

        # Register every parameter value only once, otherwise several files
        # with the same value would produce duplicate output rows.
        if parameterValue not in parameters:
            parameters.append(parameterValue)
        if parameterValue not in results:
            results[parameterValue] = {}

        expRun = 0

        # Read the contents of the file
        with open(arg, 'r') as f:
            try:
                for line in f:
                    try:
                        cells = line.split("\t")
                        algo = os.path.basename(cells[0])

                        # We added timesplit to the results recently, hence
                        # result counts are at pos 8 now. Before that, they
                        # were at pos 7 (there, pos 8 is a filename/path, so
                        # we can simply check for casting errors).
                        try:
                            resultCount = int(float(cells[8]))
                        except (IndexError, ValueError, OverflowError):
                            resultCount = int(float(cells[7]))

                        if algo not in algorithms:
                            algorithms.append(algo)

                        # The experiment run determines if we must overwrite
                        # an older experiment of the same algorithm.
                        runs = results[parameterValue].get(algo)
                        if runs is not None and expRun in runs:
                            runs[expRun][0] += int(cells[1])
                            runs[expRun][1] += 1
                        else:
                            results[parameterValue][algo] = {expRun: [int(cells[1]), 1, resultCount]}
                    except (IndexError, ValueError, OverflowError):
                        # Any line that is not a data line (blank line,
                        # header, ...) marks the start of a new run.
                        expRun += 1
                        continue
            except (OSError, UnicodeError):
                # Best effort: keep what was read so far and go on with the
                # next file if this one cannot be read or decoded.
                continue

        # (file is closed here by the with-statement)

    # Without any parsed data line there is nothing to tabulate.
    if not algorithms:
        printErrorAndExit("No experiment results found in the given files.")

    # Print header
    print("\t".join([parameterName] + algorithms + ["RESULTS"]))

    # Print data lines (first field = varying parameter)
    firstAlgo = algorithms[0]
    for parameter in sorted(parameters, key=natural_sort_key):
        res = results[parameter]

        # The first algorithm provides the reference result count that all
        # other algorithms of the same parameter value must agree with.
        if firstAlgo not in res:
            printErrorAndExit("Algorithm %s not found in results." % firstAlgo)
        resultCount = next(iter(res[firstAlgo].values()))[2]

        fields = [parameter]
        for a in algorithms:
            if a in res:
                val = next(iter(res[a].values()))
                if resultCount != val[2]:
                    printErrorAndExit("Different result counts for the same parameter-value found!")
                fields.append("%d" % int(float(val[0]) / val[1]))
            else:
                fields.append("nan")
        fields.append("%d" % resultCount)
        print("\t".join(fields))
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()