-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_nez_docs.py
70 lines (48 loc) · 1.46 KB
/
parse_nez_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
from os import listdir
from os.path import isfile, join
import re
import sys
import pdb
# Keywords searched for in each file name. find_first_pattern() tries them in
# the order listed here, so earlier entries win when several are present.
PATTERNS = [
    r'JCC',
    r'Hearing',
    r'Assessor',
    r'Resolution',
    r'Petition',
    r'PDD',
    r'Clerk',
]
def find_first_pattern(file):
    """Return the match for the first PATTERNS entry found in ``file``.

    Patterns are tried in the order they appear in PATTERNS, so the winner is
    decided by list order, not by where the keyword sits in the string.

    Args:
        file: String to search (the script passes a file name).

    Returns:
        The ``re.search`` match object for the first pattern that matches,
        or None when none of the patterns match.
    """
    # Lazy generator: searching stops as soon as one pattern matches.
    attempts = (re.search(pattern, file) for pattern in PATTERNS)
    return next((hit for hit in attempts if hit), None)
def get_title(file, match):
    """Build a title from the start of a file name up to a matched keyword.

    Args:
        file: The file name the match was found in.
        match: Match object whose ``end()`` marks where the title stops.

    Returns:
        ``file`` truncated at the end of the match, with every underscore
        replaced by a space.
    """
    prefix = file[:match.end()]
    return prefix.replace('_', ' ')
def get_year(title):
    """Extract the first four-digit run found after the first four characters.

    Args:
        title: Title string, as produced by get_title().

    Returns:
        The four digits as an int, or None when the text after the first
        four characters contains no run of four digits.
    """
    # The first four characters are never searched.
    # NOTE(review): presumably this skips a leading four-character prefix of
    # the file name - confirm against the real file names.
    skip = 4
    found = re.search(r'[\d]{4}', title[skip:])
    return int(found.group()) if found else None
def main(argv):
    """Print a CSV listing (URL, title, year) for the files in a directory.

    A header row is printed first, then one row per regular file found
    directly inside the directory. The script's own file is skipped.

    Args:
        argv: Command-line argument list; ``argv[0]`` is the script path and
            ``argv[1]`` is the path to the nez docs directory.

    Raises:
        SystemExit: If ``argv`` does not contain exactly one directory
            argument; the usage message is written to stderr.
        ValueError: If find_first_pattern() finds no match in a file name.
    """
    # Local import: the file-level imports only bring in isfile and join.
    from os.path import basename

    if len(argv) != 2:
        sys.exit("usage: ./parse_nez_docs.py <path to nez docs>")

    # Compare against the base name so the script is excluded however it was
    # invoked ("./script.py", "python script.py", an absolute path, ...).
    script_file = basename(argv[0])
    path = argv[1]
    names = [
        name for name in listdir(path)
        if isfile(join(path, name)) and name != script_file
    ]

    url_prefix = "https://detroitmi.gov/sites/detroitmi.localhost/files/migrated_docs/nez_reports/"
    print("URL,Title,Year")
    for name in names:
        match = find_first_pattern(name)
        if not match:
            # Exception classes reject keyword arguments, so the message has
            # to be passed positionally.
            raise ValueError("File name {} could not be parsed".format(name))
        title = get_title(name, match)
        year = get_year(title)
        print("\"{}\",\"{}\",{}".format(url_prefix + name, title, year))


if __name__ == '__main__':
    main(sys.argv)