# start_crawl.py
# Forked from bucm-policy-search/webSpider.
import os
import logging
import schedule
import time
from time import strftime
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from webSpider.spiders.beijing import BATCM
from webSpider.spiders.hebei import HCOHP
from webSpider.spiders.shanxi import HCOSP
from webSpider.spiders.guojia import NATCM
from webSpider.spiders.shaanxi import SATCM
from webSpider.spiders.jilin import JLTCM
from webSpider.spiders.anhui import AHTCM
import argparse
def _build_parser():
    """Create the command-line parser for the crawl launcher.

    Two optional string flags are registered:

    * ``-m`` / ``--mode``: crawl mode; the help text names "dev" and "prod",
      and the default is "prod".
    * ``-a`` / ``--auto``: whether to run on the daily schedule; the help
      text names "auto" and "non", and the default is "auto".

    Neither flag restricts its value, so any string is accepted.
    """
    cli = argparse.ArgumentParser(description="Crawl Parameters")
    # Help text (zh): crawler mode, test or production; default "prod".
    cli.add_argument(
        "-m",
        "--mode",
        default="prod",
        help=' 爬虫模式:测试模式 or 生产模式 - "dev(development)" / "prod(production)",默认 "prod" ',
    )
    # Help text (zh): whether to enable the scheduled automatic crawl;
    # default "auto".
    cli.add_argument(
        "-a",
        "--auto",
        default="auto",
        help=' 是否开启定点自动爬虫 - "auto(Automatic)" / "non(Non-automatic)",默认 "auto" ',
    )
    return cli


# Parsed once at import time; the rest of the script reads these globals.
parser = _build_parser()
args = parser.parse_args()
crawl_mode, crawl_auto = args.mode, args.auto
def job():
    """Run every registered spider once, logging warnings to a per-run file.

    Ensures ``./logs`` exists, attaches a WARNING-level file handler named
    after the current local time to the root logger, crawls all spiders
    through a single ``CrawlerProcess`` with ``mode=crawl_mode``, and
    detaches the handler again once the crawl has finished or failed.

    NOTE(review): ``CrawlerProcess.start()`` runs the Twisted reactor, which
    according to the Scrapy/Twisted documentation cannot be restarted, so
    calling this function a second time in the same interpreter is expected
    to fail. Confirm callers invoke it at most once per process.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs("./logs/", mode=0o755, exist_ok=True)

    # ISO 8601 timestamp format.
    # NOTE(review): the ":" characters are not valid in Windows file names.
    current_time = strftime("%Y-%m-%dT%H:%M:%S%z")

    # logging.basicConfig() silently does nothing when the root logger
    # already has a handler (for example after any earlier logging call),
    # which would leave this run without its log file. Attaching an explicit
    # handler guarantees one file per run.
    run_log = logging.FileHandler(f"./logs/scrapy_{current_time}.log")
    # The level lives on the handler so the file stays WARNING-and-above
    # even if another component changes the root logger's level.
    run_log.setLevel(logging.WARNING)
    run_log.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s:%(message)s")
    )
    root_logger = logging.getLogger()
    root_logger.addHandler(run_log)

    try:
        process = CrawlerProcess(get_project_settings())
        for spider_cls in (BATCM, HCOHP, HCOSP, NATCM, SATCM, JLTCM, AHTCM):
            process.crawl(spider_cls, mode=crawl_mode)
        # Blocks until every scheduled spider has finished.
        process.start()
    finally:
        # Release the file so handlers do not pile up across invocations.
        root_logger.removeHandler(run_log)
        run_log.close()
def _run_job_in_subprocess():
    """Run one crawl in a fresh interpreter and wait for it to finish.

    The scheduler loop is long-lived, but ``job()`` starts the Twisted
    reactor through ``CrawlerProcess.start()``, and per the Scrapy/Twisted
    documentation a reactor cannot be restarted once it has stopped. Running
    ``job()`` directly from the schedule would therefore fail on the second
    day. Re-invoking this script with ``--auto=non`` gives every scheduled
    run its own process (and reactor) while reusing the one-shot branch
    below.
    """
    import subprocess
    import sys

    command = [
        sys.executable,
        os.path.abspath(__file__),
        # "--opt=value" form keeps unusual values from being read as flags.
        f"--mode={crawl_mode}",
        "--auto=non",
    ]
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        # Report the failure but keep the scheduler alive for the next day.
        logging.error(
            "Crawl subprocess exited with code %s.", completed.returncode
        )


# Guarded so importing this module does not start a crawl or the endless
# scheduler loop.
if __name__ == "__main__":
    if crawl_auto == "non":
        # One-shot mode: crawl immediately in this process, then exit.
        job()
    else:
        # Any value other than "non" enables the daily 03:35 schedule.
        schedule.every().day.at("03:35").do(_run_job_in_subprocess)
        # NOTE(review): the root logger defaults to WARNING, so this INFO
        # record is probably dropped unless logging is configured elsewhere;
        # confirm the intended level.
        logging.info("Start crawling.")
        while True:
            schedule.run_pending()
            time.sleep(1)