arachas.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
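"""Crawl the card listing on gwentify.com, parse every card into JSON and,
optionally, download the card artwork.

The results are written to ./output/<name>.json and ./output/<name>.jsonl,
and artwork is saved into the ./media folder.

Usage:
    python3 arachas.py [-o OUTPUT_NAME] [--image]
"""
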
import os.path
import json
import time
import queue
import threading
import requests
import mimetypes
import argparse
import re
from unidecode import unidecode
import gwentifyHandler as siteHandler
import indexer

args = {}
NAME_REPLACE = {" ": "_", ":": "", "'": "", "`": "", "’": "", "(": "", ")": ""}
# URL where the crawl begins.
HOST = 'http://gwentify.com/cards/?view=table'
IMAGE_FOLDER = 'media'
FILE_NAME = 'latest'
DOWNLOAD_ARTWORK = False
# Timeout (in seconds) for the requests module.
TIMEOUT = 5.0
# Number of threads that the program uses for each task.
THREADS_COUNT = 10

# Queue containing the URL of every page.
pageQueue = queue.Queue()
# Queue containing the URL of every card.
cardQueue = queue.Queue()
# Queue containing every card already processed and ready to be saved.
finalDataQueue = queue.Queue()
# Queue containing (name, URL) pairs of artwork to download.
imageQueue = queue.Queue()

# Request headers.
HEADERS = {
    'User-Agent': 'Mozilla/5.0'
}


# Set the command line parameters.
def setParser():
    parser = argparse.ArgumentParser(description='This script allows you to crawl different Gwent community websites '
                                                 'to parse and save data about the cards.')
    parser.add_argument('-o', '--output', help='Name of the json file that will be saved.', required=False)
    parser.add_argument('--image', help='Use this argument to download the full size artwork for all cards.',
                        action='store_true', required=False)
    global args
    args = parser.parse_args()


# Class responsible for processing the URL of a page and obtaining the URL of every card on that page.
class ThreadPage(threading.Thread):
    def __init__(self, pageQueue, cardQueue):
        threading.Thread.__init__(self)
        self.pageQueue = pageQueue
        self.cardQueue = cardQueue

    def run(self):
        while True:
            url = self.pageQueue.get()
            res = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if res.status_code == 200:
                # Send the html to the siteHandler module for processing.
                # Returns a list of URLs, one for every card on the page.
                listCards = siteHandler.getCardsUrl(res.content)
                # Add every entry of listCards to the cardQueue.
                list(map(self.cardQueue.put, listCards))
            else:
                print("Error fetching page %s (status %s)" % (url, res.status_code))
            # Notify that we have finished one task.
            self.pageQueue.task_done()


# Class responsible for processing the URL of a card and obtaining all information related to that card.
class CardThread(threading.Thread):
    def __init__(self, cardQueue, finalDataQueue, imageQueue):
        threading.Thread.__init__(self)
        self.cardQueue = cardQueue
        self.finalDataQueue = finalDataQueue
        self.imageQueue = imageQueue

    def run(self):
        while True:
            url = self.cardQueue.get()
            res = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if res.status_code == 200:
                # Send the html to the siteHandler module for processing.
                # Returns a dictionary describing the card.
                cardData = siteHandler.getCardJson(res.content)
                key = getNameKey(cardData['name'])
                cardData['key'] = key
                self.finalDataQueue.put(cardData)
                # Queue the full-size artwork and the thumbnail for download.
                self.imageQueue.put((cardData['key'], cardData['variations'][0]['art']['fullsizeImage']))
                self.imageQueue.put((cardData['key'] + "_thumbnail", cardData['variations'][0]['art']['thumbnailImage']))
            else:
                print("Error fetching card %s (status %s)" % (url, res.status_code))
            # Notify that we have finished one task.
            self.cardQueue.task_done()


# Transform the given name into a URL-friendly key, e.g. "Geralt: Igni" -> "geralt_igni".
def getNameKey(name):
    # https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    name = name.lower()
    # In one pass, replace spaces and a few special characters.
    rep = dict((re.escape(k), v) for k, v in NAME_REPLACE.items())
    pattern = re.compile("|".join(rep.keys()))
    name = pattern.sub(lambda m: rep[re.escape(m.group(0))], name)
    # Try to represent the Unicode data in ASCII characters; this removes accents and the like.
    name = unidecode(name)
    return name


# Class responsible for downloading the card artworks.
class ImageThread(threading.Thread):
    def __init__(self, imageQueue):
        threading.Thread.__init__(self)
        self.imageQueue = imageQueue

    def run(self):
        while True:
            # The name will be used for saving the file.
            name, url = self.imageQueue.get()
            res = requests.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True)
            if res.status_code == 200:
                content_type = res.headers['content-type']
                # With the content type received from the web server, use mimetypes to guess the file extension.
                extension = mimetypes.guess_extension(content_type)
                filepath = os.path.join('./' + IMAGE_FOLDER + '/' + name + extension)
                with open(filepath, 'wb') as f:
                    # Stream the file to disk chunk by chunk.
                    for chunk in res:
                        f.write(chunk)
            # Notify that we have finished one task.
            self.imageQueue.task_done()


# Retrieve the list of URLs for every page of cards.
# The url parameter is the entry point of the website from which we extract the information.
def getPages(url):
    listPages = []
    res = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    if res.status_code == 200:
        # Process the html and return a list of URLs for every available page.
        listPages = siteHandler.getPages(res.content)
        listPages.append(url)
    else:
        print("Error fetching %s (status %s)" % (url, res.status_code))
    return listPages


# Save a list of cards to disk in the json and jsonl formats.
# filename is the name under which the files will be saved.
# cardList is the list of cards.
# The files are saved in an 'output' folder relative to the current working directory.
def saveJson(filename, cardList):
    filepath = os.path.join('./output/' + filename)
    # Make sure the output folder exists before writing to it.
    os.makedirs('./output', exist_ok=True)
    print("Saving %s cards to: %s" % (len(cardList), filepath))
    with open(filepath + ".json", "w", encoding="utf-8", newline="\n") as f:
        json.dump(cardList, f, ensure_ascii=False, sort_keys=True, indent=2, separators=(',', ': '))
    # Also save a jsonl version: one card per line.
    with open(filepath + ".jsonl", "w", encoding="utf-8", newline="\n") as f:
        isFirst = True
        for card in cardList:
            if not isFirst:
                f.write("\n")
            else:
                isFirst = False
            json.dump(card, f, ensure_ascii=False, sort_keys=True)


def main():
    # Global flag telling us whether or not we should download the artworks.
    global DOWNLOAD_ARTWORK
    # Retrieve the parameter sent by the user.
    if args.image:
        DOWNLOAD_ARTWORK = args.image
    # Folder where the artworks are saved.
    imageFolderPath = os.path.join('./' + IMAGE_FOLDER)
    if not os.path.exists(imageFolderPath):
        os.makedirs(imageFolderPath)
    # Start THREADS_COUNT threads working on retrieving card URLs from page URLs.
    for i in range(THREADS_COUNT):
        t = ThreadPage(pageQueue, cardQueue)
        t.setDaemon(True)
        t.start()
    # Retrieve the URL of all pages.
    pages = getPages(HOST)
    # Populate the page queue.
    for page in pages:
        pageQueue.put(page)
    # Start THREADS_COUNT threads working on retrieving card data from card URLs.
    for i in range(THREADS_COUNT):
        c = CardThread(cardQueue, finalDataQueue, imageQueue)
        c.setDaemon(True)
        c.start()
    # Start THREADS_COUNT threads working on downloading the artwork for the cards.
    if DOWNLOAD_ARTWORK:
        for i in range(THREADS_COUNT):
            it = ImageThread(imageQueue)
            it.setDaemon(True)
            it.start()
    # Block until every page has been processed.
    pageQueue.join()
    # Block until every card has been processed.
    cardQueue.join()
    if DOWNLOAD_ARTWORK:
        imageQueue.join()
    cardList = list(finalDataQueue.queue)
    # Sort the cards by name in order to get a predictable output.
    # This makes it easier to see differences when using a diff tool.
    cardList = sorted(cardList, key=lambda element: element['name'])
    # Default file name used to save the data.
    global FILE_NAME
    # It may be overridden by the cli parameter.
    if args.output:
        FILE_NAME = args.output
    saveJson(FILE_NAME, cardList)
    # Run the indexer to get a rough summary of changes between every run of the script.
    indexer.Indexer(cardList)


if __name__ == '__main__':
    setParser()
    print("Starting")
    start = time.time()
    main()
    print("Elapsed Time: %s" % (time.time() - start))