# scrapper.py
import os
from shutil import rmtree
import urllib3
import urllib3.request
from bs4 import BeautifulSoup
from tqdm import tqdm
from istockphoto import IStockSearch
class Scrapper:
    """Download images found by a paginated image search into a local folder.

    The search engine object must provide ``engine_name``, ``search_query``,
    ``sort_type``, ``attr_name``, ``attr_dict`` and ``create_search_url(page)``;
    those are the only members this class touches.

    Images are saved as ``0001.jpg``, ``0002.jpg``, ... inside ``save_path``.
    """

    # Bytes requested per read while streaming an image body to disk.
    CHUNK_SIZE = 64 * 1024

    def __init__(self, search_engine: "IStockSearch", total_image=1000):
        """Prepare an empty output folder, the HTTP pool and the progress bar.

        Args:
            search_engine: Search description used to build page URLs and to
                locate image tags in the returned HTML.
            total_image: Number of images to download before stopping.
        """
        self.search_engine = search_engine
        self.total_image = total_image
        self.save_path = (
            self.search_engine.engine_name
            + '_'
            + '_'.join(search_engine.search_query.split())
        )
        # Start from a clean folder: results of a previous run are discarded.
        if os.path.exists(self.save_path):
            rmtree(self.save_path, ignore_errors=True)
        # rmtree may silently leave the folder behind (ignore_errors=True), so
        # tolerate an existing directory instead of failing like os.mkdir does.
        os.makedirs(self.save_path, exist_ok=True)
        self.pool_man = urllib3.PoolManager()
        self.downloaded_im = 0
        self.parser_type = 'html.parser'
        self.response = None
        self.links = None
        self.pbar = tqdm(total=self.total_image, unit='image')

    def url_request(self, page):
        """Fetch search result page ``page`` and store its image tags in ``self.links``."""
        self.response = self.pool_man.request(
            'GET', self.search_engine.create_search_url(page)
        )
        try:
            self.links = self.__bs4_parser()
        finally:
            # Release the response even when parsing raises.
            self.response.release_conn()

    def __bs4_parser(self):
        """Return every tag in the last response that matches the engine's selector."""
        soup = BeautifulSoup(self.response.data, self.parser_type)
        return soup.find_all(
            self.search_engine.attr_name, attrs=self.search_engine.attr_dict
        )

    def image_url_to_file(self):
        """Save the images referenced by ``self.links`` until the limit is reached.

        Tags without a ``src`` attribute are skipped. Exceptions raised by the
        request or by file I/O propagate to the caller; the connection of the
        image being processed is released in every case.
        """
        for link in self.links:
            if self.downloaded_im >= self.total_image:
                break
            im_url = link.get('src')
            if not im_url:
                # Nothing to fetch; requesting None would abort the whole page.
                continue
            im_request = self.pool_man.request('GET', im_url, preload_content=False)
            try:
                target = os.path.join(
                    self.save_path, f'{self.downloaded_im + 1:04}.jpg'
                )
                with open(target, 'wb') as out:
                    # read() returns b'' once the body is exhausted.
                    for chunk in iter(lambda: im_request.read(self.CHUNK_SIZE), b''):
                        out.write(chunk)
            finally:
                # Previously skipped when the loop broke at the image limit or
                # when streaming failed, leaving the connection checked out.
                im_request.release_conn()
            self.downloaded_im += 1
            self.pbar.update(1)

    def download(self, max_stalled_pages=5):
        """Walk the result pages until ``total_image`` images are saved.

        Args:
            max_stalled_pages: Stop after this many consecutive pages that
                produced no new image (failed request, empty page, end of the
                results). ``None`` disables the limit and keeps requesting
                pages indefinitely.
        """
        self.print_scrap_conf()
        print('Starting!')
        page = 1
        stalled_pages = 0
        try:
            while self.downloaded_im < self.total_image:
                self.pbar.set_description(f"Downloading page: {page} ")
                saved_before = self.downloaded_im
                try:
                    self.url_request(page)
                    self.image_url_to_file()
                except Exception as e:
                    # Best effort: a broken page must not end the whole run.
                    print(f'Fail on page {page}:', e)
                page += 1

                if self.downloaded_im > saved_before:
                    stalled_pages = 0
                    continue
                stalled_pages += 1
                if max_stalled_pages is not None and stalled_pages >= max_stalled_pages:
                    # Without this guard the loop never ends once the search
                    # runs out of results or every request keeps failing.
                    print(f'\nNo new images in the last {stalled_pages} pages, stopping.')
                    break
        finally:
            self.pbar.close()
        self.check_download_count()

    def check_download_count(self):
        """Compare the number of entries in ``save_path`` with ``total_image`` and report."""
        p = os.listdir(self.save_path)
        print('\nChecking local files!')
        if len(p) == self.total_image:
            print(f'Scrap successful! {self.downloaded_im}/{self.total_image}')
        else:
            print(f'Scrap failed! {self.downloaded_im}/{self.total_image}')

    def print_scrap_conf(self):
        """Print the search engine, its sort method, the image target and the save path."""
        print(f'\nSearch engine: \t\t\t\t{self.search_engine}')
        print(f'Search engine sort method: \t{self.search_engine.sort_type}')
        print(f'Total image request: \t\t{self.total_image}')
        print(f'Save path: \t\t\t\t\t{self.save_path}\n')