-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
74 lines (56 loc) · 2.52 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Extract questions and their answers from an HTML document.

    The parser is a small state machine. The public ``handle_starttag``,
    ``handle_endtag`` and ``handle_data`` callbacks forward to whichever
    bound method is currently stored in ``_handle_starttag``,
    ``_handle_endtag`` and ``_handle_data``; those methods swap each other
    in and out as the expected markup is encountered.

    Results accumulate in ``_questions``, a list of dicts shaped like
    ``{"question": str, "answers": [{"correct": bool, "answer": str}]}``.
    """

    def _noop(self, *args, **kwargs):
        """Ignore the event; used as the 'not interested' handler."""
        return

    def __init__(self):
        super().__init__()
        # Start by looking for the container that holds all questions;
        # text and end tags are ignored until a state asks for them.
        self._handle_starttag = self._find_div_id_questions
        self._handle_endtag = self._noop
        self._handle_data = self._noop
        # Whether the answer row currently being read is marked correct.
        self._correct_answer = False
        self._questions = []

    def handle_starttag(self, tag, attrs):
        """Forward a start tag to the handler of the current state."""
        return self._handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """Forward an end tag to the handler of the current state."""
        return self._handle_endtag(tag)

    def handle_data(self, data):
        """Forward a text chunk to the handler of the current state."""
        return self._handle_data(data)

    def _find_div_id_questions(self, tag, attrs):
        """Wait for ``<div id="questions">``, then look for questions."""
        if tag == "div" and ("id", "questions") in attrs:
            self._handle_starttag = self._find_question_b

    def _find_question_b(self, tag, attrs):
        """Open a new question on ``<b>``; ignore text after other tags."""
        if tag == "b":
            self._questions.append({"question": "", "answers": []})
            # The next text chunk is the question itself.
            self._handle_data = self._print_question
            self._handle_starttag = self._find_correct_answer
            self._correct_answer = False
        else:
            self._handle_data = self._noop

    def _find_correct_answer(self, tag, attrs):
        """Record whether the upcoming answer row is flagged as correct."""
        if tag == "div" and (
            "class",
            "row correct-answer green inverted",
        ) in attrs:
            self._correct_answer = True
        elif tag == "div" and ("class", "row") in attrs:
            self._correct_answer = False
        # NOTE(review): this transition is taken for every start tag, not
        # only for a matching row div -- confirm that is intended.
        self._handle_starttag = self._find_p_fifteen

    def _find_p_fifteen(self, tag, attrs):
        """Find the answer text ``<p>``, or the start of the next grid."""
        # The trailing space in the class value is part of the exact match.
        if tag == "p" and ("class", "fifteen wide column ") in attrs:
            self._handle_data = self._print_answer
        elif tag == "div" and ("class", "ui grid") in attrs:
            self._handle_starttag = self._find_question_b

    def _print_answer(self, data):
        """Store ``data`` as an answer to the most recent question.

        Surrounding whitespace and embedded newlines are removed, and a
        single trailing ``,`` or ``.`` is dropped. Whitespace-only chunks
        are skipped and the parser keeps waiting for the answer text.
        """
        answer = data.strip().replace("\n", "")
        if not answer:
            # Indexing an empty string would raise IndexError; a blank
            # chunk carries no answer, so stay in this state.
            return
        if answer.endswith((",", ".")):
            answer = answer[:-1]
        self._questions[-1]["answers"].append(
            {
                "correct": self._correct_answer,
                "answer": answer,
            }
        )
        self._handle_data = self._noop
        self._handle_starttag = self._find_correct_answer

    def _print_question(self, data):
        """Store ``data`` verbatim as the text of the most recent question."""
        self._questions[-1]["question"] = data
        self._handle_data = self._noop
import json
import sys

# Parse the saved page and dump the collected questions as JSON on stdout.
parser = MyHTMLParser()
# Decode explicitly instead of relying on the platform's locale encoding.
# NOTE(review): assumes the saved page is UTF-8 -- confirm against the file.
with open(
    "Seznam testových otázek ke zkoušce odborné způsobilosti | ZbraněKvalitně.cz.html",
    "r",
    encoding="utf-8",
) as fp:
    parser.feed(data=fp.read())
# ensure_ascii=False keeps non-ASCII characters readable in the output.
json.dump(parser._questions, sys.stdout, ensure_ascii=False, indent=4)