forked from PaulSonOfLars/telegram-bot-api-spec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
365 lines (284 loc) · 12.2 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import json
import re
import string
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
# Scalar "core" types used by the Bot API docs that have no schema entry of their own.
TG_CORE_TYPES = ["String", "Boolean", "Integer", "Float"]
ROOT_URL = "https://core.telegram.org"
# Map of schema name -> docs page URL to scrape.
TO_SCRAPE = {
    "api": ROOT_URL + "/bots/api",
}
# Keys for the two top-level sections of the generated schema dict.
METHODS = "methods"
TYPES = "types"
# List of all abstract types which don't have subtypes.
APPROVED_NO_SUBTYPES = {
    "InputFile"  # This is how telegram represents files
}
# List of all approved multi-returns.
APPROVED_MULTI_RETURNS = [
    ['Message', 'Boolean']  # Edit returns either the new message, or an OK to confirm the edit.
]
def retrieve_info(url: str) -> dict:
    """Download the Bot API docs page at *url* and parse it into a schema dict.

    Returns a dict with "version", "release_date" and "changelog" metadata,
    plus METHODS and TYPES sub-dicts keyed by method/type name.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, features="html5lib")
    dev_rules = soup.find("div", {"id": "dev_page_content"})
    # The page body is a flat sequence of tags, so we walk it as a state
    # machine: curr_type/curr_name track which method/type the subsequent
    # <p>/<table>/<ul> tags describe.
    curr_type = ""
    curr_name = ""
    # First header in the body is the release date (with an anchor)
    release_tag = dev_rules.find("h4", recursive=False)
    changelog_url = url + release_tag.find("a").get("href")
    # First paragraph in the root body is the version
    version = dev_rules.find("p", recursive=False).get_text()
    items = {
        "version": version,
        "release_date": release_tag.get_text(),
        "changelog": changelog_url,
        METHODS: dict(),
        TYPES: dict(),
    }
    for x in list(dev_rules.children):  # type: Tag
        if x.name == "h3" or x.name == "hr":
            # New category; clear name and type.
            curr_name = ""
            curr_type = ""
        if x.name == "h4":
            anchor = x.find("a")
            name = anchor.get("name")
            # Anchors with a "-" are section headings, not type/method
            # definitions; reset state so their content is skipped.
            if name and "-" in name:
                curr_name = ""
                curr_type = ""
                continue
            curr_name, curr_type = get_type_and_name(x, anchor, items, url)
        # Nothing to attach content to until we've seen a definition header.
        if not curr_type or not curr_name:
            continue
        if x.name == "p":
            # Paragraphs accumulate onto the current item's description.
            items[curr_type][curr_name].setdefault("description", []).extend(clean_tg_description(x, url))
        if x.name == "table":
            # Tables hold the fields (types) / parameters (methods).
            get_fields(curr_name, curr_type, x, items, url)
        if x.name == "ul":
            # Bullet lists define subtypes of abstract types.
            get_subtypes(curr_name, curr_type, x, items, url)
        # Only methods have return types.
        # We check this every time just in case the description has been updated, and we have new return types to add.
        if curr_type == METHODS and items[curr_type][curr_name].get("description"):
            get_method_return_type(curr_name, curr_type, items[curr_type][curr_name].get("description"), items)
    return items
def get_subtypes(curr_name: str, curr_type: str, x: Tag, items: dict, url: str):
    """Record the bullet-list entries under the current item.

    For types, the list entries are stored as "subtypes"; for every item
    they are also appended to the description as "- ..." lines for docs.
    """
    if curr_name == "InputFile":  # Has no interesting subtypes
        return
    entries = [
        line
        for li in x.find_all("li")
        for line in clean_tg_description(li, url)
    ]
    # List items found in types define possible subtypes.
    if curr_type == TYPES:
        items[curr_type][curr_name]["subtypes"] = entries
    # Always add the list to the description, for better docs.
    items[curr_type][curr_name]["description"] += [f"- {entry}" for entry in entries]
# Extract the fields (for types) or parameters (for methods) from a docs table.
def get_fields(curr_name: str, curr_type: str, x: Tag, items: dict, url: str):
    """Parse the <tbody> rows of *x* into field dicts on the current item.

    Type tables have 3 columns (name, type, description); method tables
    have 4 (name, type, required, description). Anything else aborts.
    """
    fields = []
    for row in x.find("tbody").find_all("tr"):
        cells = list(row.find_all("td"))
        n_cells = len(cells)
        if curr_type == TYPES and n_cells == 3:
            # Type fields mark optionality via a leading "Optional. " prefix.
            desc = clean_tg_field_description(cells[2], url)
            field = {
                "name": cells[0].get_text(),
                "types": clean_tg_type(cells[1].get_text()),
                "required": not desc.startswith("Optional. "),
                "description": desc,
            }
            fields.append(field)
        elif curr_type == METHODS and n_cells == 4:
            # Method parameters have an explicit "Required" column.
            field = {
                "name": cells[0].get_text(),
                "types": clean_tg_type(cells[1].get_text()),
                "required": cells[2].get_text() == "Yes",
                "description": clean_tg_field_description(cells[3], url)
            }
            fields.append(field)
        else:
            # Unknown table layout - the docs format must have changed; bail out.
            print("An unexpected state has occurred!")
            print("Type:", curr_type)
            print("Name:", curr_name)
            print("Number of children:", n_cells)
            print(cells)
            exit(1)
    items[curr_type][curr_name]["fields"] = fields
def get_method_return_type(curr_name: str, curr_type: str, description_items: list[str], items: dict):
    """Derive a method's return type(s) from its description text.

    The docs phrase returns as "Returns X", "On success, X", or
    "X is returned"; the first matching pattern wins.
    """
    description = "\n".join(description_items)
    candidates = (
        re.search("(?:on success,|returns)([^.]*)(?:on success)?", description, re.IGNORECASE),
        re.search("([^.]*)(?:is returned)", description, re.IGNORECASE),
    )
    for match in candidates:
        if match:
            extract_return_type(curr_type, curr_name, match.group(1).strip(), items)
            return
def get_type_and_name(t: Tag, anchor: Tag, items: dict, url: str):
    """Register a fresh entry for the definition headed by *t*.

    A leading uppercase letter means a type; otherwise it's a method
    (methods are lowerCamelCase in the docs). Returns (name, section).
    """
    curr_type = TYPES if t.text[0].isupper() else METHODS
    curr_name = t.get_text()
    entry = {"name": curr_name}
    href = anchor.get("href")
    if href:
        entry["href"] = url + href
    items[curr_type][curr_name] = entry
    return curr_name, curr_type
def extract_return_type(curr_type: str, curr_name: str, ret_str: str, items: dict):
    """Parse the return-clause text *ret_str* and store the resulting types.

    "Array of X" phrases keep their prefix; otherwise every capitalised
    word is treated as a candidate type name (punctuation stripped).
    """
    array_match = re.search(r"(?:array of )+(\w*)", ret_str, re.IGNORECASE)
    if array_match:
        inner_types = clean_tg_type(array_match.group(1))
        items[curr_type][curr_name]["returns"] = [f"Array of {t}" for t in inner_types]
        return
    strip_punct = str.maketrans("", "", string.punctuation)
    found = []
    for word in ret_str.split():
        # Only capitalised words can be type names.
        if not word[0].isupper():
            continue
        found.extend(clean_tg_type(word.translate(strip_punct)))
    items[curr_type][curr_name]["returns"] = found
def clean_tg_field_description(t: Tag, url: str) -> str:
    """Flatten a field/parameter description cell into a single line of text."""
    parts = clean_tg_description(t, url)
    return " ".join(parts)
def clean_tg_description(t: Tag, url: str) -> list[str]:
    """Convert the HTML tag *t* into a list of cleaned plaintext lines.

    Mutates *t* in place (emoji images -> alt text, <br> -> newline,
    "more info »" anchors -> inline URLs), then normalises whitespace and
    typography and splits the text on newlines, dropping blank lines.

    :param t: the docs tag to clean (modified in place).
    :param url: base page URL, used to resolve page-relative anchors.
    :return: non-empty, stripped lines of the cleaned text.
    """
    # Replace HTML emoji images with actual emoji
    for img in t.find_all("img"):
        img.replace_with(img.get("alt"))
    # Make sure to include linebreaks, or spacing gets weird
    for br in t.find_all("br"):
        br.replace_with("\n")
    # Replace helpful anchors with the actual URL.
    for a in t.find_all("a"):
        anchor_text = a.get_text()
        if "»" not in anchor_text:
            continue
        link = a.get("href")
        # Page-relative URL
        if link.startswith("#"):
            link = url + link
        # Domain-relative URL
        elif link.startswith("/"):
            link = ROOT_URL + link
        anchor_text = anchor_text.replace(" »", ": " + link)
        a.replace_with(anchor_text)
    text = t.get_text()
    # Replace any weird double whitespaces (eg double space, newlines, tabs) with single occurrences
    text = re.sub(r"(\s){2,}", r"\1", text)
    # Replace weird UTF-8 quotes with proper quotes
    text = text.replace('”', '"').replace('“', '"')
    # Replace weird unicode ellipsis with three dots
    text = text.replace("…", "...")
    # Use sensible dashes
    text = text.replace(u"\u2013", "-")
    text = text.replace(u"\u2014", "-")
    # Use sensible single quotes
    text = text.replace(u"\u2019", "'")
    # Split on newlines to improve description output.
    # BUG FIX: the comprehension previously reused `t` as its loop variable,
    # shadowing the Tag parameter - harmless here but error-prone; renamed.
    return [line.strip() for line in text.split("\n") if line.strip()]
def get_proper_type(t: str) -> str:
    """Normalise a docs type name to its canonical schema name.

    Unknown names pass through unchanged.
    """
    replacements = {
        "Messages": "Message",  # Avoids https://core.telegram.org/bots/api#sendmediagroup
        "Float number": "Float",
        "Int": "Integer",
        "True": "Boolean",
        "Bool": "Boolean",
    }
    return replacements.get(t, t)
def clean_tg_type(t: str) -> list[str]:
    """Split a docs type string into a list of normalised type names.

    Handles an optional "Array of " prefix (preserved on each result)
    and "A or B" / "A and B" / "A, B" compound phrasings.
    """
    prefix = ""
    marker = "Array of "
    if t.startswith(marker):
        prefix = marker
        t = t[len(marker):]
    parts = [t]
    # Successively break apart "or"-, "and"- and comma-separated lists.
    for separator in (" or ", " and ", ", "):
        parts = [piece.strip() for part in parts for piece in part.split(separator)]
    return [prefix + get_proper_type(part) for part in parts]
# Returns True if an issue is found.
def verify_type_parameters(items: dict) -> bool:
    """Cross-check all scraped types for links, fields/subtypes and valid field types.

    Also populates "subtype_of" back-references on known subtypes.
    Prints a diagnostic for each problem; returns True if any was found.
    """
    found_problem = False
    for type_name, values in items[TYPES].items():
        # check all values have a URL
        if not values.get("href"):
            print(f"{type_name} has no link!")
            found_problem = True
            continue
        fields = values.get("fields", [])
        if not fields:
            subtypes = values.get("subtypes", [])
            description = "".join(values.get("description", [])).lower()
            # Some types are abstract and have no information or subtypes - this is mentioned in their description.
            # Otherwise, check if they're manually approved.
            known_empty = (
                "currently holds no information" in description
                or type_name in APPROVED_NO_SUBTYPES
            )
            if not subtypes and not known_empty:
                print("TYPE", type_name, "has no fields or subtypes, and is not approved")
                found_problem = True
                continue
            for st in subtypes:
                if st not in items[TYPES]:
                    print("TYPE", type_name, "USES INVALID SUBTYPE", st)
                    found_problem = True
                else:
                    # Link the subtype back to its parent for easier lookups.
                    items[TYPES][st].setdefault("subtype_of", []).append(type_name)
        # check all parameter types are valid
        for param in fields:
            for field_type in param.get("types"):
                base = field_type
                # Strip any number of "Array of " prefixes to get the base type.
                while base.startswith("Array of "):
                    base = base[len("Array of "):]
                if base not in items[TYPES] and base not in TG_CORE_TYPES:
                    print("UNKNOWN FIELD TYPE", base)
                    found_problem = True
    return found_problem
# Returns True if an issue is found.
def verify_method_parameters(items: dict) -> bool:
    """Cross-check all scraped methods for links, returns and valid types.

    Prints a diagnostic for each problem; returns True if any was found.
    Multiple return types only warn (unless pre-approved), without
    marking the schema as broken.
    """
    found_problem = False
    # Type check all methods
    for method, values in items[METHODS].items():
        # check all values have a URL
        if not values.get("href"):
            print(f"{method} has no link!")
            found_problem = True
            continue
        # check all methods have a return
        returns = values.get("returns")
        if not returns:
            print(f"{method} has no return types!")
            found_problem = True
            continue
        # If we have multiple returns, this is an edge case, but not a deal-breaker; check that output.
        # Some multi-returns are common and pre-approved.
        if len(returns) > 1 and returns not in APPROVED_MULTI_RETURNS:
            print(f"{method} has multiple return types: {returns}")
        # check all parameter types are valid
        for param in values.get("fields", []):
            for param_type in param.get("types"):
                base = param_type
                while base.startswith("Array of "):
                    base = base[len("Array of "):]
                if base not in items[TYPES] and base not in TG_CORE_TYPES:
                    found_problem = True
                    print("UNKNOWN PARAM TYPE", base)
        # check all return types are valid
        for ret_type in values.get("returns", []):
            base = ret_type
            while base.startswith("Array of "):
                base = base[len("Array of "):]
            if base not in items[TYPES] and base not in TG_CORE_TYPES:
                found_problem = True
                print("UNKNOWN RETURN TYPE", base)
    return found_problem
def main():
    """Scrape every configured docs page, validate it, and dump JSON schemas.

    For each entry in TO_SCRAPE, writes "<name>.json" (pretty-printed) and
    "<name>.min.json" (minified). Exits with status 1 on validation failure.
    """
    for filename, url in TO_SCRAPE.items():
        print("parsing", url)
        items = retrieve_info(url)
        if verify_type_parameters(items) or verify_method_parameters(items):
            print("Failed to validate schema. View logs above for more information.")
            exit(1)
        # BUG FIX: the output paths were a literal string with no placeholder,
        # so every scraped page clobbered the same file and the `filename`
        # loop variable was unused; name outputs after the TO_SCRAPE key.
        with open(f"{filename}.json", "w") as f:
            json.dump(items, f, indent=2)
        with open(f"{filename}.min.json", "w") as f:
            json.dump(items, f)


if __name__ == '__main__':
    main()