-
Notifications
You must be signed in to change notification settings - Fork 5
/
get_mathlib4_history.py
123 lines (106 loc) · 4.05 KB
/
get_mathlib4_history.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from dataclasses import dataclass
import functools
import hashlib
import re
from tqdm import tqdm
import datetime
import git
from tqdm import tqdm
import port_status_yaml
# upstream bug
git.Git.CatFileContentStream.__next__ = git.Git.CatFileContentStream.next
# from https://github.com/leanprover-community/mathlib4/blob/master/scripts/make_port_status.py#L83-L95
source_module_re = re.compile(r"^! .*source module (.*)$")
commit_re = re.compile(r"^! (leanprover-community/[a-z]*) commit ([0-9a-f]*)")
import_re = re.compile(r"^import ([^ ]*)")
align_import_re = re.compile(
r'^#align_import ([^ ]*) from "(leanprover-community/[a-z]*)" ?@ ?"([0-9a-f]*)"')
def parse_name(name: str) -> tuple[str]:
"""Parse a lean name including french quotes"""
components = re.compile(r"((?:[^.«»]|«.*?»)*)(?:(\.)|$)", re.UNICODE)
pos = 0
parts = []
while True:
m = components.match(name, pos=pos)
if m:
parts.append(m.group(1).replace('«', '').replace('»', ''))
if not m.group(2):
break
pos = m.end()
else:
raise ValueError(name, pos)
return tuple(parts)
def name_to_str(parts: tuple[str]) -> str:
return '.'.join([
'«' + p + '»' if p.isdigit() else p for p in parts
])
def get_mathlib4_module_commit_info(contents):
module = repo = commit = None
for line in contents:
m = source_module_re.match(line)
if m:
module = parse_name(m.group(1))
m = commit_re.match(line)
if m and module:
repo = m.group(1)
commit = m.group(2)
break
m = align_import_re.match(line)
if m:
module = parse_name(m.group(1))
repo = m.group(2)
commit = m.group(3)
# TODO: parse multiple `#align_import`s? We don't want to read the whole file unless we
# have to.
break
return module, repo, commit
@functools.cache
def port_info_from_blob(b: git.Blob):
return get_mathlib4_module_commit_info(l.decode('utf8') for l in b.data_stream.stream)
@dataclass
class FileHistoryEntry:
module: tuple[str]
source: port_status_yaml.PortStatusEntry.Source
commit: git.Commit
diff: git.Diff
def _NULL_TREE(repo):
"""
An empty Git tree is represented by the C string "tree 0". The hash of the
empty tree is always SHA1("tree 0\\0"). This method computes the
hexdigest of this sha1 and creates a tree object for the empty tree of the
passed repo.
"""
null_tree_sha = hashlib.sha1(b"tree 0\0").hexdigest()
return repo.tree(null_tree_sha)
def get_history(repo: git.Repo, root='Mathlib', rev=None, desc='Getting mathlib4 history') -> dict[str, list[FileHistoryEntry]]:
file_history = {}
last = _NULL_TREE(repo)
with tqdm(repo.iter_commits(rev=rev, paths=[root], first_parent=True, reverse=True),
desc=desc) as pbar:
for commit in pbar:
pbar.set_postfix_str(datetime.datetime.fromtimestamp(commit.committed_date).isoformat(), refresh=False)
diffs = last.diff(commit, create_patch=True)
last = commit
for d in diffs:
if d.b_blob is not None:
try:
module, r, c = port_info_from_blob(d.b_blob)
except ValueError:
continue
if module is None:
continue
entry = FileHistoryEntry(
module=module,
source=port_status_yaml.PortStatusEntry.Source(repo=r, commit=c),
commit=commit,
diff=d)
file_history.setdefault(module, []).insert(0, entry)
return file_history
if __name__ == '__main__':
import time
repo = git.Repo('build/repos/mathlib4')
t0 = time.time()
history = get_mathlib4_history(repo)
t = time.time() - t0
print(t, len(history), max(len(v) for v in history.values()))
print(max(history.items(), key=lambda x: len(x[1])))