-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.py
executable file
·32 lines (24 loc) · 1005 Bytes
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from framework.spider import Spider as _Spider
from framework.engine import Engine
from framework.http.request import Request
from pipeline import PrintPipeline
from item import ArticleItem
import redis
class Spider(_Spider):
    """Spider that walks the blog archive listing and emits one item per post.

    ``parse`` handles listing pages: it schedules a request for every post
    link and then follows the pagination link. ``article_parser`` handles
    the individual post pages and produces ``ArticleItem`` objects.
    """

    # Entry point of the crawl: the first page of the post archive.
    start_url = "http://blog.jobbole.com/all-posts/"

    def parse(self, response):
        """Yield article requests for a listing page, then the next page.

        Every element matching ``.archive-title`` that carries an ``href``
        is requested with ``article_parser`` as its callback. If the page
        has a ``.navigation .next`` element with an ``href``, that URL is
        requested with ``parse`` itself as the callback.
        """
        for post in response.cssselect('.archive-title'):
            url = post.get('href')
            # Skip title elements without a link rather than building a
            # Request around None.
            if url:
                yield Request(url, self.article_parser)

        # The last listing page has no "next" link; indexing [0] on the
        # empty selection used to raise IndexError and abort the generator.
        try:
            next_link = response.cssselect('.navigation .next')[0]
        except IndexError:
            return

        next_page = next_link.get('href')
        if next_page:
            yield Request(next_page, self.parse)

    def article_parser(self, response):
        """Build and yield an ``ArticleItem`` for a single post page.

        The title is the first text node of the ``<h1>`` inside
        ``div.entry-header``; the URL is taken from the response.
        """
        item = ArticleItem()
        # NOTE(review): the [0] presumably raises IndexError when the page
        # has no matching heading -- confirm how the engine treats that.
        item.title = response.xpath('//div[@class="entry-header"]/h1/text()')[0]
        item.url = response.url
        yield item
if __name__ == '__main__':
    # Script entry point: wire the spider to a Redis-backed engine, register
    # the pipeline for ArticleItem objects, and start the engine.
    import os

    # The Redis host may be overridden through the REDIS_HOST environment
    # variable; without it the original hard-coded address is used.
    redis_host = os.environ.get('REDIS_HOST', '192.168.222.134')

    engine = Engine(Spider(), redis.Redis(redis_host))
    engine.set_pipeline(ArticleItem, PrintPipeline())
    engine.driver()