-
Notifications
You must be signed in to change notification settings - Fork 1
/
parsers.py
96 lines (72 loc) · 2.37 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import ssl
import urllib.error
import urllib.parse
import urllib.request
import requests
from bs4.dammit import UnicodeDammit
from lxml import html
def create_url_opener():
    """Build a urllib opener for HTTPS sites with broken/self-signed certs.

    NOTE(security): the context deliberately skips TLS certificate
    verification (private ``ssl._create_unverified_context`` API), so the
    opener will happily talk to any server. The browser-like User-Agent
    avoids trivial bot blocking.

    :rtype: urllib.request.OpenerDirector
    """
    unverified_ctx = ssl._create_unverified_context()
    https_handler = urllib.request.HTTPSHandler(context=unverified_ctx)
    url_opener = urllib.request.build_opener(https_handler)
    url_opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    return url_opener
class Message:
    """Plain container for a single forum/site message.

    Parsers fill in ``text`` after construction; it always starts empty.
    """

    def __init__(self):
        # Instance attribute instead of the previous class attribute, so the
        # default lives on each Message object rather than on the class.
        self.text = ''
class SiteParser(object):
    """Base class for site scrapers.

    Holds a per-instance cache of parsed pages and an HTTPS opener that
    skips certificate verification (see ``create_url_opener``).
    Subclasses override :meth:`get_title` and :meth:`get_messages`.
    """

    # True on subclasses that point at downloadable media files.
    is_media_parser = False

    def __init__(self, base_url):
        """
        :param base_url: root URL of the thread/page this parser scrapes
        """
        self.base_url = base_url
        self.opener = create_url_opener()
        # Per-instance page cache. This used to be a class attribute, so one
        # ever-growing dict was shared (and mutated) across every parser
        # instance and every subclass — now each parser caches independently.
        self.cache = {}

    def parse_page(self, url):
        """Fetch *url* and return it as a parsed lxml document (cached).

        The encoding is sniffed with UnicodeDammit so pages with missing or
        wrong charset declarations still decode correctly.

        :rtype: lxml.html.HtmlElement
        """
        if url not in self.cache:
            # Close the HTTP response explicitly instead of leaking the
            # socket held by the previous `open(url).read()` one-liner.
            with self.opener.open(url) as response:
                content = response.read()
            detected = UnicodeDammit(content, is_html=True)
            parser = html.HTMLParser(encoding=detected.original_encoding)
            self.cache[url] = html.document_fromstring(content, parser=parser)
        return self.cache[url]

    def get_title(self):
        """Return the page title; subclasses override.

        :rtype: str
        """
        return ""

    def get_messages(self):
        """Return the parsed messages; subclasses override.

        :rtype: list of Message
        """
        return []
class ForumParser(SiteParser):
    """Base class for paginated forum threads.

    Subclasses must provide ``get_page_title``, ``get_last_page``,
    ``build_page_url`` and ``get_messages_for_page``.
    """

    # Maximum number of messages to return from get_messages().
    limit = 30

    def get_title(self):
        """Return the thread title extracted from the base page.

        :rtype: str
        """
        return self.get_page_title(self.parse_page(self.base_url))

    def get_messages(self):
        """Collect up to ``limit`` of the most recent messages.

        Walks pages from the last to the first, prepending each page's
        messages so ``self.messages`` stays in chronological order, and
        stops (trimming to the newest ``limit``) once enough have been
        gathered. Returns them newest-first.

        :rtype: reversed iterator of Message
        """
        self.messages = []
        collected = 0
        for page_number in range(self.get_page_count(), 0, -1):
            if collected >= self.limit:
                # Keep only the newest `limit` messages before stopping.
                self.messages = self.messages[-self.limit:]
                break
            page_url = self.build_page_url(page_number)
            parsed = self.parse_page(page_url)
            page_messages = self.get_messages_for_page(parsed, page_url)
            collected += len(page_messages)
            # Earlier pages go in front: chronological order is preserved.
            self.messages = page_messages + self.messages
        return reversed(self.messages)

    def get_page_count(self):
        """Return the number of pages in the thread, per the base page.

        :rtype: int
        """
        return int(self.get_last_page(self.parse_page(self.base_url)))
class MediaParser(SiteParser):
    """Base class for parsers that point at downloadable media files."""

    is_media_parser = True

    def get_file_size(self, url):
        """Return the Content-Length of *url* via a HEAD request.

        Best effort: returns None on any network error, non-200 status,
        or a reply without a Content-Length header.

        :param url: direct URL of the media file
        :rtype: str or None
        """
        try:
            # NOTE(security): verify=False disables TLS certificate checks,
            # kept for parity with the unverified urllib opener used elsewhere.
            response = requests.head(url, allow_redirects=True, verify=False)
            if response.status_code == 200:
                # .get() instead of ['Content-Length']: a 200 response
                # without the header previously raised an uncaught KeyError
                # (KeyError is not a RequestException).
                return response.headers.get('Content-Length')
        except requests.exceptions.RequestException:
            # Deliberate best-effort swallow: size is simply unknown.
            pass
        return None