import scrapy
from pathlib import Path
from logger import logger
import re
from trafilatura import extract


def get_cookie(response):
    """Extract the session cookie that Virgool sets via an inline JS snippet."""
    content = response.body.decode('utf-8')
    # Raw string avoids invalid-escape warnings; the match behaviour is unchanged.
    cookie_regex = r"document\.cookie\s*=\s*'(.+); Max-Age"
    match = re.search(cookie_regex, content)
    if match:
        return match.group(1)
    return None


class VirgoolSpider(scrapy.Spider):
    name = "Virgool"
    number_of_pages = 30000
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'HTTPCACHE_ENABLED': True,
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
    }

    def __init__(self, gather_index_pages=False, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = []
        self.gather_index_pages = gather_index_pages
        if not self.gather_index_pages:
            # Crawl the article URLs collected in a previous index-gathering run.
            self.start_urls = Path('virgool/index.txt').read_text().split('\n')
        else:
            # Queue the paginated index pages so parse() can harvest article links.
            for i in range(0, self.number_of_pages + 1):
                self.start_urls.append(
                    f"https://virgool.io/?page={i}"
                )
            logger.info('urls are appended')

    def parse(self, response, **kwargs):
        try:
            if self.gather_index_pages:
                # Index mode: pull article links out of the listing page and append them to the index file.
                for item in range(1, 20):
                    url = response.css(f'main#app article:nth-child({item}) > div > a::attr(href)').get()
                    if url:
                        with Path('virgool/index.txt').open("a") as f:
                            f.write(url + '\n')
            else:
                # Article mode: scrape metadata and extract the main text with trafilatura.
                item = {
                    'title': response.css('main#app h1::text').get(),
                    'author': response.css('main#app div.module-header > a::text').get(),
                    'text': extract(response.body.decode('utf-8'), deduplicate=True,
                                    include_images=False, include_comments=False,
                                    include_links=False),
                    'url': response.css('.shorturl-text::text').get(),
                }
                # strip() would raise on None, so only normalise fields that were actually found.
                for key in item:
                    if item[key]:
                        item[key] = re.sub(' +', ' ', item[key]).strip()
                return item
        except Exception:
            logger.error("Parsing error", exc_info=True)

    def handle_error(self, failure):
        # Retry the failed request once, routing it back through the same callbacks.
        # Use a %s placeholder so the URL actually appears in the log message.
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)
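

if __name__ == "__main__":
    # Minimal sketch of running the spider standalone, assuming this file lives next to
    # the 'virgool/index.txt' path and the local 'logger' module used above. In a full
    # Scrapy project the spider would normally be launched with `scrapy crawl Virgool`
    # or `scrapy runspider virgool.py -a gather_index_pages=1 -o articles.jsonl`.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    # Pass gather_index_pages=True first to collect article URLs into virgool/index.txt,
    # then run again with the default False to scrape the collected articles.
    process.crawl(VirgoolSpider, gather_index_pages=False)
    process.start()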