crawl.py (executable file · 91 lines · 2.37 KB)
#!/usr/bin/env python3
# chameleon-crawler
#
# Copyright 2016 ghostwords.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import datetime
from random import shuffle
from multiprocessing import Process, Queue
from urllib.parse import urlparse

from crawler.args import parse_args
from crawler.collector import collect
from crawler.crawler_manager import Crawler
from crawler.utils import Logger

from utils.database import DATABASE_URL, initialize_database

import dataset
import sys


def run():
    # get commandline args
    args = parse_args()

    initialize_database()

    # store start time & args, plus get an ID for this crawl
    with dataset.connect(DATABASE_URL) as db:
        crawl_id = db['crawl'].insert(dict(
            args=" ".join(sys.argv[1:]),
            start_time=datetime.now()
        ))

    url_queue = Queue()  # (url, num_timeouts) tuples
    result_queue = Queue()

    # read in URLs and populate the job queue
    with args.urls:
        urls = list(args.urls)

    # randomize crawl order
    shuffle(urls)

    for url in urls:
        url = url.strip()
        if not urlparse(url).scheme:
            url = 'http://' + url
        url_queue.put((url, 0))

    log = Logger().log if not args.quiet else lambda *args, **kwargs: None

    # launch browsers
    crawlers = []
    for i in range(args.num_crawlers):
        crawler = Process(
            target=Crawler,
            args=(i + 1,),
            kwargs={
                'crx': args.crx,
                'headless': args.headless,
                'logger': log,
                'timeout': args.timeout,
                'url_queue': url_queue,
                'result_queue': result_queue
            }
        )
        crawler.start()
        crawlers.append(crawler)

    # start the collector process
    Process(target=collect, args=(crawl_id, result_queue, log)).start()

    # wait for all browsers to finish
    for crawler in crawlers:
        crawler.join()

    # tell collector we are done
    result_queue.put(None)

    # store completion time
    with dataset.connect(DATABASE_URL) as db:
        db['crawl'].update(dict(id=crawl_id, end_time=datetime.now()), 'id')

    log("Main process all done!")


if __name__ == '__main__':
    run()