-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_html.py
116 lines (102 loc) · 2.84 KB
/
generate_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
from show_extractor import *
from glob import glob
from pathlib import PurePath
def emit_header():
    """Print the opening part of the HTML report to stdout.

    The output covers the <html>/<head> section with the inline stylesheet,
    the opening <body> tag and the introductory paragraph. The document is
    left open; emit_footer() prints the matching closing tags.
    """
    # The literal starts and ends with a newline, and print() appends one
    # more, so the emitted text is framed by blank lines.
    print("""
<html>
<head>
<meta charset="UTF-8">
<style rel="stylesheet">
body {
font-family:Sans-Serif;
line-height: 1.5em;
background-color: #FAFAFA
}
ul {
float:left;
color:#444;
margin:5px;
width:250px;
}
div {
margin-left:20px
}
h1 {
font-weight: 600;
margin-bottom:0px;
line-height:120%;
margin-left: 30px;
margin-top: 15px;
clear:both;
}
h2 {
margin-top:5px;
padding-bottom:4px;
margin-left:-20px;
margin-bottom:5px;
border-bottom: 4px solid #777;
}
p {
margin-left:30px;
font-size:1.2em;
margin-bottom: 0px;
}
</style>
</head>
<body>
<p>Shows extracted from the Content section of the 8K/Exhibit 99.1 SEC filings of Netflix.</p>
""")
def emit_footer():
    """Print the closing </body> and </html> tags to stdout.

    Counterpart of emit_header(); call it once after all content is written.
    """
    # Leading/trailing newlines are part of the emitted text; print() adds
    # one more newline at the end.
    print("""
</body>
</html>
""")
def emit_heading(year):
    """Print an <h1> heading containing ``year`` to stdout."""
    heading = "<h1>{}</h1>".format(year)
    print(heading)
def emit_content_list(quarter, shows):
    """Print one quarter's shows as an HTML <ul> block on stdout.

    The list starts with an <h2> holding ``quarter`` and contains one <li>
    per show; each item links to a Netflix search for that title.

    Args:
        quarter: Label for the list heading (e.g. "Q1").
        shows: Iterable of show titles.

    Titles come from parsed filings and may contain characters such as
    ``'``, ``&``, ``<`` or spaces. They are percent-encoded inside the URL
    and HTML-escaped in the visible text so they cannot break the
    single-quoted href attribute or the surrounding markup.
    """
    # Imported locally so the names cannot be shadowed by the star-import
    # at the top of the file.
    from html import escape
    from urllib.parse import quote

    items = []
    for show in shows:
        title = str(show)
        # safe="" also encodes "/" so the whole title stays one query value.
        query = quote(title, safe="")
        items.append(
            f"<li><a href='https://netflix.com/search?q={query}'>"
            f"{escape(title)}</a></li>"
        )
    print(f"<ul><h2>{escape(str(quarter))}</h2>{''.join(items)}</ul>")
def apply_manual_corrections(shows):
    """Clean up show names that are artifacts of parsing the filings.

    This function exists to clean shows that are a quirk of parsing a
    manually generated, inconsistent html file.

    Args:
        shows: Iterable of raw show-name strings.

    Returns:
        A sorted list of cleaned names with blacklisted fragments and
        names that are empty after cleaning removed, deduplicated without
        regard to case. The first casing encountered is the one kept.
    """
    # Fragments that the parser produces but that are not show titles.
    blacklist = {"and", "Marvel’s"}

    # Shows are already deduplicated but sometimes the same content might
    # appear in different casing, e.g. "Set it Up" / "Set It Up". We keep
    # the original casing for display and dedupe on the lowercased name.
    seen_lowercase = set()
    display_shows = []
    for show in shows:
        # Clean certain artifacts, like dangling punctuation.
        show = show.strip(".").strip(": ")
        # An unmatched closing parenthesis is a leftover, not part of the name.
        if "(" not in show and show.endswith(")"):
            show = show.strip(")")
        # Check the blacklist after cleaning so that variants such as "and."
        # are caught too, and drop names that cleaning reduced to nothing
        # (they would otherwise render as empty list items).
        if not show or show in blacklist:
            continue
        key = show.lower()
        if key not in seen_lowercase:
            seen_lowercase.add(key)
            display_shows.append(show)
    return sorted(display_shows)
def main(data_dir='./data/'):
    """Print the complete HTML report for the filings in ``data_dir``.

    Every ``*.html`` file whose name matches the expected filing pattern is
    processed in reverse-sorted filename order. Each year gets an <h1>
    heading followed by a <div> holding one show list per quarter.

    Args:
        data_dir: Directory prefix (including the trailing slash) that is
            globbed for ``*.html`` filing files.
    """
    # Imported locally; only needed for warnings, which must not go to
    # stdout because stdout carries the generated HTML.
    import sys

    # Raw string so that "\d" and "\." are regex escapes rather than
    # (invalid) Python string escapes.
    # NOTE(review): the pattern expects "Exhibit-91" while the page header
    # mentions "Exhibit 99.1" - confirm against the actual file names.
    yqre = re.compile(r'^8K-Exhibit-91-(\d{4})-(Q[1-4])\.html$')
    current_year = None  # year whose <div> is currently open, if any

    emit_header()
    for path in sorted(glob(data_dir + "*.html"), reverse=True):
        filename = PurePath(path).name
        match = yqre.match(filename)
        if match is None:
            # Not a filing we know how to label; skip it instead of crashing.
            print(f"skipping unrecognised file: {filename}", file=sys.stderr)
            continue
        year, quarter = match.groups()

        html, _text = extract_content_section(path)
        shows = apply_manual_corrections(get_shows_html(html))

        if year != current_year:
            if current_year is not None:
                print("</div>")
            emit_heading(year)
            print("<div>")
            current_year = year
        emit_content_list(quarter, shows)

    # Close the <div> of the last year so the markup stays balanced.
    if current_year is not None:
        print("</div>")
    emit_footer()


if __name__ == '__main__':
    main()