dt.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2019/03/12 21:10
# @Author : Xu
# @Site   : https://xuccc.github.io/

import click
import logging
import sys

from scrapy.crawler import CrawlerRunner, CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor, defer
from apscheduler.schedulers.twisted import TwistedScheduler

from DuTracker.spiders.brand import BrandSpider
from DuTracker.spiders.serie import SerieSpider
from DuTracker.spiders.product import ProductSpider
from DuTracker.spiders.tracker import TrackerSpider
from DuTracker.utils.log import log


@click.group()
def cli():
    pass
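

# Overview of the subcommands defined below (summary only, derived from the code):
#   show       - list remote brand & serie info
#   crawl      - initialize the product database (product.sqlite)
#   addproduct - fetch product info for the given product ids
#   start      - schedule recurring crawls and track product prices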


@cli.command()
def show():
    settings = get_project_settings()
    log.info('Showing remote brand & series info')
    runner = CrawlerRunner(settings)

    # Run the brand and serie spiders sequentially, then stop the reactor.
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(BrandSpider, auto=True)
        yield runner.crawl(SerieSpider, auto=True)
        reactor.stop()

    crawl()
    reactor.run()


@cli.command(help='Init Database')
@click.option('--verbose', '-v', is_flag=True, default=False)
@click.option('--debug', is_flag=True, default=False, help='show scrapy log')
@click.option('--proxy', help='proxy url')
def crawl(verbose, debug, proxy):
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        # Enable the random-proxy downloader middleware when a proxy URL is given.
        settings['DOWNLOADER_MIDDLEWARES'].update({
            'DuTracker.middlewares.RandomProxy': 760
        })
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True

    log.info('Initializing database product.sqlite')
    runner = CrawlerRunner(settings)

    # Crawl brands, then series, then products (from the database), sequentially.
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(BrandSpider)
        yield runner.crawl(SerieSpider)
        yield runner.crawl(ProductSpider, fromDB=True)
        reactor.stop()

    crawl()
    reactor.run()


@cli.command(help='Add product information by productId')
@click.argument('pid', type=int, nargs=-1)
@click.option('--verbose', '-v', is_flag=True, default=False)
@click.option('--debug', is_flag=True, default=False, help='show scrapy log')
def addproduct(pid, debug, verbose):
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if debug:
        settings['LOG_ENABLED'] = True

    # Crawl only the given product ids.
    process = CrawlerProcess(settings)
    process.crawl(ProductSpider, productIds=pid)
    process.start()


@cli.command(help='Monitor products\' price')
@click.option('--verbose', '-v', is_flag=True, default=False)
@click.option('--debug', is_flag=True, default=False, help='show scrapy log')
@click.option('--proxy', help='proxy url')
# @click.option('--day', type=int, default=1)
@click.option('--min', type=int, default=1000)
@click.option('--product', '-p', multiple=True, type=int, help='product ids')
@click.option('--brand', '-b', multiple=True, type=int, help='brand ids')
@click.option('--serie', '-s', multiple=True, type=int, help='serie ids')
@click.option('--check/--no-check', default=True)
@click.option('--delay', type=float, help='delay between downloads')
@click.option('--news', is_flag=True, default=False)
@click.option('--days', type=int, default=14, help='save log by days')
def start(verbose, debug, proxy, min, product, brand, serie, check, delay, news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error(f'InfluxDB connection failed: {e}')
            sys.exit(1)
        else:
            log.success('InfluxDB connected successfully')

    if check:
        check_db()

    # Run Scrapy on a schedule, see
    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update({
            'DuTracker.middlewares.RandomProxy': 760
        })
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True
    if delay:
        settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    # Refresh brand/serie/product data once a day; run the first crawl immediately.
    if brand:
        sched.add_job(process.crawl, 'interval', args=[BrandSpider], kwargs={'auto': True, 'Ids': brand}, days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl, 'interval', args=[SerieSpider], kwargs={'auto': True, 'Ids': serie}, days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl, 'interval', args=[ProductSpider], kwargs={'fromDB': True}, days=1)
        process.crawl(ProductSpider, fromDB=True)

    # Track prices every 6 hours; optionally track newly released items every hour.
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)
    sched.add_job(process.crawl, 'interval', args=[TrackerSpider], kwargs={'soldNum_min': min, 'Ids': product}, hours=6)
    if news:
        sched.add_job(process.crawl, 'interval', args=[TrackerSpider], kwargs={'newItem': True, 'days': days},
                      hours=1)
    # Periodically print the scheduled jobs.
    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    # Keep the reactor running so scheduled crawls can keep firing.
    process.start(stop_after_crawl=False)


if __name__ == '__main__':
    cli()
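

# Example invocations (a sketch only; the ids, proxy URL and values below are
# placeholders, and the DuTracker project settings plus a reachable InfluxDB
# instance are assumed to be configured):
#
#   ./dt.py show
#   ./dt.py crawl -v --proxy http://127.0.0.1:8000/proxy
#   ./dt.py addproduct 123456 234567
#   ./dt.py start -b 13 -s 7 --min 1000 --delay 0.5 --news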