-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsource.py
193 lines (153 loc) · 8.46 KB
/
source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import bs4
import hashlib
import json
import langdetect
import mistune.toc
import os
import shutil
from datetime import datetime
from pathlib import Path
from parser import parser # `parser` used to be a stdlib in 2.6(!), but dw it'll find ./parser.py
class SourceFile:
    """A single file from a website's source tree, plus the state needed to build it.

    Files whose suffix is ``.md`` or ``.html`` are treated as pages: they are
    read into memory, (for markdown) parsed and poured into a template, and
    written out as ``.html``. Files with a ``.json`` suffix are never written.
    Every other file is copied to the output tree unchanged.

    Attributes:
        fp: Path of the source file.
        domain: Owning domain object; this class reads its ``config``,
            ``source``, ``output`` and ``domain`` attributes.
        metadata: Per-file settings. Starts as a copy of ``domain.config`` and
            is then updated from the JSON sidecar and the inline meta block.
        source: Page text (``None`` for non-page files).
        rendered: Final output text (``None`` until ``load_template`` runs).
        md_ast: Second value returned by ``parser.parse`` for markdown pages.
        rendered_toc: Table of contents produced by ``generate_toc``.
    """

    # Suffixes that are processed as pages rather than copied verbatim.
    _PAGE_SUFFIXES = (".html", ".md")

    def __init__(self, fp: Path, domain: "WebsiteDomain"):
        """Work out the output target, then load the source text and metadata.

        Args:
            fp: Path of the source file; must be located under ``domain.source``.
            domain: The domain object this file belongs to.
        """
        self.fp = fp
        self.domain = domain
        # Merging into a fresh dict copies the config, so per-file changes
        # never leak back into the shared domain configuration.
        self.metadata = {} | domain.config

        # Target output path, relative to the output root.
        target = fp.relative_to(domain.source)
        if self._is_page():
            if (domain.source / target).with_suffix("").is_dir():
                # A page that shares its name with a sibling directory is
                # emitted as that directory's index.html (per the original
                # author's note, nginx cannot serve it otherwise).
                target = target.with_suffix("") / "index.html"
            else:
                # Ordinary page: same location, .html extension.
                target = target.with_suffix(".html")
        self.metadata["target"] = target

        self.source = None
        self.rendered = None
        self.md_ast = None
        self.rendered_toc = ""

        # The sidecar is applied twice on purpose: once before reading the
        # source, so its values (e.g. "lang", "description") suppress the
        # automatic detection, and once afterwards, so it also takes
        # precedence over the inline meta block.
        self.load_metadata()
        self.load_source()
        self.load_metadata()

    def _is_page(self):
        """Return True when this file is page content (markdown or HTML)."""
        return self.fp.suffix in self._PAGE_SUFFIXES

    def load_source(self):
        """Read the page text and, for markdown, derive its metadata.

        Non-page files are left with ``source = None``. HTML pages are only
        read. Markdown pages additionally get a detected ``lang`` (when none
        is set), have their inline meta block extracted, and receive a
        fallback ``description`` built from the leading text.
        """
        if not self._is_page():
            return
        self.source = self.fp.read_text("utf-8")
        if self.fp.suffix == ".html":
            return

        # Markdown only from here on.
        # If no language is specified yet, detect it from the content. The
        # inline meta block is parsed afterwards and may still override it.
        if self.metadata.get("lang") is None:
            self.metadata["lang"] = langdetect.detect(self.source)

        self._extract_inline_metadata()

        if self.metadata.get("description") is None:
            self.metadata["description"] = self._build_description()

    def _extract_inline_metadata(self):
        """Strip a leading fenced ``meta`` block from the source into ``metadata``.

        The block has to be the very first thing in the file. Every line in it
        is ``key: value``; keys are lower-cased and both sides are stripped.
        Blank lines are ignored, and the closing fence may be the last line of
        the file even without a trailing newline.

        Raises:
            ValueError: If a non-blank line has no ``:`` or the block is never
                closed.
        """
        opening = "```meta\n"
        if not self.source.startswith(opening):
            return

        remaining = self.source[len(opening):]
        while True:
            line, newline, remaining = remaining.partition("\n")
            if line.rstrip() == "```":
                break
            if not newline:
                # Ran out of text without ever seeing the closing fence.
                raise ValueError(f"Unterminated ```meta block in {self.fp}")
            if not line.strip():
                continue
            key, colon, val = line.partition(":")
            if not colon:
                raise ValueError(
                    f"Malformed metadata line {line!r} in {self.fp} (expected 'key: value')"
                )
            self.metadata[key.strip().lower()] = val.strip()
        self.source = remaining

    def _build_description(self, limit=255):
        """Return up to ``limit`` characters of leading text to use as description.

        Lines starting with ``#`` (headers) or ``<img`` are skipped; the rest
        are stripped and joined with single spaces.
        """
        intro = ""
        for line in self.source.split("\n"):
            if line.startswith(("#", "<img")):
                continue
            intro += line.strip() + " "
            if len(intro) >= limit:
                break
        return intro[:limit].strip()

    def load_metadata(self):
        """Merge the JSON sidecar (``<file name>.json``) into ``metadata``, if present."""
        meta_fp = self.fp.with_suffix(self.fp.suffix + ".json")
        if meta_fp.is_file():
            with open(meta_fp, "r", encoding="utf-8") as f:
                self.metadata |= json.load(f)

    def render_to_html(self):
        """Run markdown through ``parser.parse``, replacing ``source`` and setting ``md_ast``."""
        if self.fp.suffix != ".md":
            # Direct HTML pages are rendered (copied) later, in load_template.
            return
        self.source, self.md_ast = parser.parse(self.source)

    def generate_toc(self):
        """Render the table of contents and take the page title from a leading H1.

        Does nothing when there is no parse result or it holds no ToC items.
        As used here, each item carries the heading level at index 0 and the
        heading text at index 2.
        """
        if self.md_ast is None:
            return
        toc_items = self.md_ast.env["toc_items"]
        if not toc_items:
            return

        # If the first item of the ToC is a level-1 heading, it is the title...
        if toc_items[0][0] == 1:
            if self.metadata.get("title") is None:
                self.metadata["title"] = toc_items[0][2]  # only if not already set
            toc_items.pop(0)  # ...so remove it, keeping it out of the rendered ToC
        self.rendered_toc = mistune.toc.render_toc_ul(toc_items)

    def search_for_og_image(self):
        """Pick the page's og:image when the metadata does not already set one.

        Candidates, in order: the first ``<img class="og_image">``, the first
        ``<img>`` of any kind, the domain's ``default_og_image``, and finally
        an empty string. Images without a ``src`` attribute are skipped.
        """
        if self.metadata.get("og_image"):  # already set, nothing to do
            return
        if not self._is_page():  # static files have no og:image
            return

        soup = bs4.BeautifulSoup(self.source, "html5lib")
        for tag in (soup.find("img", class_="og_image"), soup.find("img")):
            if tag is not None and tag.get("src"):
                self.metadata["og_image"] = str(tag["src"])
                return
        # Nothing usable in the article: fall back to the domain default, or "".
        self.metadata["og_image"] = self.domain.config.get("default_og_image") or ""

    def load_template(self):
        """Initialise ``rendered`` with the text that the variables get filled into.

        HTML pages use their own source. Markdown pages read the template named
        by ``metadata["template"]`` from the ``_templates`` directory, which is
        resolved relative to the current working directory.

        Raises:
            NotImplementedError: If a markdown page has no template configured.
        """
        if not self._is_page():
            # Static files don't have a template.
            return
        if self.fp.suffix == ".html":
            # Direct HTML files just pass their contents through to the output.
            self.rendered = self.source
            return

        # Markdown from here on.
        if self.metadata.get("template") is None:
            raise NotImplementedError(f"No template specified for markdown file {self.fp} (this shouldn't happen?)")
        self.rendered = ("_templates" / Path(self.metadata["template"])).read_text(encoding="utf-8")

    def fill_helper(self, find, repl):
        """Replace every occurrence of ``find`` in ``rendered`` with ``repl``."""
        self.rendered = self.rendered.replace(find, repl)

    def fill_variables(self):
        """Substitute the ``{_template_*}`` placeholders in ``rendered``.

        The main content is filled last, so placeholder-looking text inside the
        page body is not itself substituted.
        """
        if not self._is_page():
            return

        self.fill_helper("{_template_page_title}", self.metadata.get("title", ""))
        self.fill_helper("{_template_page_description}", self.metadata.get("description", ""))
        # Todo: og:image (share image) should default to /static/banner.webp or the first image of the article

        # as_posix() keeps forward slashes in the URL on every platform.
        url_path = Path(self.metadata["target"]).with_suffix("").as_posix()
        self.fill_helper("{_template_target_url}", f"https://{self.domain.domain}/{url_path}")

        self.fill_helper("{_template_etag}", hashlib.sha1(self.fp.read_bytes()).hexdigest()[:12])
        self.fill_helper("{_template_mtime}", str(datetime.fromtimestamp(int(self.fp.stat().st_mtime))))
        # A plain HTML page may have no template configured; use "" rather than failing.
        self.fill_helper("{_template_template}", self.metadata.get("template") or "")

        if self.md_ast is not None:  # the ToC only applies to markdown content, not plain HTML
            self.fill_helper("{_template_toc_content}", mistune.toc.render_toc_ul(self.md_ast.env["toc_items"]))
        self.fill_helper("{_template_main_content}", self.source)

    def write_output(self):
        """Write this file into the output tree.

        Returns:
            ``None`` for skipped ``.json`` files and for static files (which
            are copied as-is); ``"new"`` when a page was created; ``True`` when
            an existing page was rewritten with different content; ``False``
            when the existing page already matched.
        """
        if self.fp.suffix == ".json":
            return None  # .json files (the metadata sidecars) are never written

        target_fp = Path(self.domain.output / self.metadata["target"])
        os.makedirs(target_fp.parent, exist_ok=True)  # the output folder has to exist first

        if not self._is_page():
            shutil.copy2(self.fp, target_fp)
            return None  # static file: copied, not rendered

        # Only rendered pages get this far.
        target_fp = target_fp.with_suffix(".html")  # don't forget to set the extension
        # Write and compare the exact same bytes, so that newline translation
        # can never make an unchanged page look modified.
        payload = self.rendered.encode("utf-8")

        if not target_fp.is_file():
            target_fp.write_bytes(payload)
            return "new"  # page was newly created
        if target_fp.read_bytes() == payload:
            return False  # page exists and is unchanged
        target_fp.write_bytes(payload)
        return True  # page was updated

    def render(self):
        """Run the whole build pipeline for this file and return ``write_output()``'s result."""
        self.render_to_html()
        self.generate_toc()
        self.search_for_og_image()
        self.load_template()
        self.fill_variables()
        return self.write_output()