Downloader.py
#!/usr/bin/env python
"""
This script provides methods for downloading images from a list of
urls that can come from a local file or a website. After the download,
the user can automatically resize the images, rename them, convert
them to gray scale, and remove default stock images.
@author Thiago da Silva Alves
@version 1.0, 26/12/16
"""
from urllib.parse import urlparse
from requests import exceptions
from requests import get
import mimetypes
import os
import argparse
import PreProcessing as prp

def name_from_url(response, url):
    """Derives the image file name from the url and its extension from
    the response's content-type header."""
    parsed_result = urlparse(url)
    file_name = os.path.split(parsed_result.path)[1]
    file_name_without_ext = os.path.splitext(file_name)[0]
    content_type = response.headers['content-type']
    extension = mimetypes.guess_extension(content_type)
    extension = "" if extension is None else extension
    return ''.join([file_name_without_ext, extension]), extension
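
# A sketch of what name_from_url derives; the URL below is hypothetical
# and assumes the server answers with a content-type of image/png:
#   name_from_url(response, "http://example.com/photos/cat.png?size=2")
#   -> ("cat.png", ".png")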

def download_and_save(url, save_path, timeout=2,
                      extensions_filter=(".jpg", ".jpeg", ".png", ".bmp")):
    """Downloads a single image and saves it in save_path when its
    extension is one of extensions_filter."""
    try:
        response = get(url, timeout=timeout)
        if response.status_code == 200:  # 200 == success
            file_name, extension = name_from_url(response, url)
            print(url)
            if extension in extensions_filter:
                with open(os.path.join(save_path, file_name), "wb") as file:
                    file.write(response.content)
    except exceptions.Timeout:
        print("Timeout was reached for the url: [{}]".format(url))
    except exceptions.TooManyRedirects:
        print("Too many redirects for the url: [{}]".format(url))
    except exceptions.RequestException as e:
        print(e)

def download_images_by_list(file, save_path, timeout=2,
                            extensions_filter=(".jpg", ".jpeg", ".png",
                                               ".bmp")):
    """Iterates over a list of urls and downloads each image."""
    # If the list is a file on the local machine
    if os.path.isfile(file):
        with open(file) as url_file:
            for url in url_file:
                url = url.strip()  # Drop the trailing newline
                if url:  # Skip empty lines
                    download_and_save(url, save_path, timeout,
                                      extensions_filter)
    # If the list is hosted on some website
    else:
        response = get(file, stream=True, timeout=timeout)
        if response.status_code == 200:
            for line in response.iter_lines():
                if line:  # Skip empty lines
                    download_and_save(line.decode('utf-8'),
                                      save_path, timeout, extensions_filter)
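
# A sketch of calling the downloader from another module (hypothetical
# paths; the output directory must already exist):
#   download_images_by_list("urls.txt", "./images", timeout=5)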

def parse_arguments():
    """Parses arguments chosen by the user."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--urls',
                        help='File containing the urls; can be a website '
                             'or a local file.')
    parser.add_argument('--out',
                        help='Where the images will be saved.', required=True)
    parser.add_argument('--timeout', type=float,
                        help='Maximum time to download each image. <2>',
                        default=2)
    parser.add_argument('--img-extensions',
                        help='Tuple of image extensions that will be accepted. '
                             '(".jpg", ".jpeg", ".png", ".bmp")',
                        default=(".jpg", ".jpeg", ".png", ".bmp"))
    parser.add_argument('--prefix',
                        help='Name prefix used when saving images. <img>',
                        default="img")
    parser.add_argument('--out-extension',
                        help='Extension to use when saving an image. <.png>',
                        default=".png")
    parser.add_argument('--default-images',
                        help="Path that contains default images; these will be"
                             " used to verify whether a downloaded"
                             " image is a default image and delete it.")
    parser.add_argument('--no-convert-gray',
                        help="Don't convert downloaded images to gray scale.",
                        action='store_true')
    parser.add_argument('--no-resize',
                        help="Don't resize each downloaded image.",
                        action='store_true')
    parser.add_argument('--max-size',
                        help="Resize the larger side of each image to "
                             "fit this value. <500.0>",
                        type=float, default=500.0)
    parser.add_argument('--no-std-names',
                        help="Don't standardize image names using --prefix.",
                        action='store_true')
    return parser.parse_args()
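
# A fuller command-line sketch combining the optional flags above
# (the URL and output path are hypothetical):
#   python Downloader.py --urls http://example.com/urls.txt --out ./images \
#       --prefix img --max-size 300 --no-convert-gray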

def validations(arg):
    """Validates the parsed arguments."""
    if not os.path.isdir(arg.out):
        raise Exception("Out path doesn't exist.")

if __name__ == '__main__':
    try:
        args = parse_arguments()
        validations(args)
        if args.urls is not None:
            download_images_by_list(args.urls, args.out, args.timeout,
                                    args.img_extensions)
        # Verify if there is some corrupted image; if so, delete it.
        prp.remove_corrupted_img_path(args.out, args.img_extensions)
        if args.default_images is not None:
            # Delete every downloaded image that equals one of the
            # default images found in args.default_images.
            prp.remove_same_images(args.default_images, args.out,
                                   args.img_extensions)
        if not args.no_resize:
            # Resizes all images in a path to fit args.max_size
            prp.resize_image_path(args.out, args.max_size, args.img_extensions)
        if not args.no_convert_gray:
            # Converts all images in a path to gray scale
            prp.convert_to_gray_path(args.out, args.img_extensions)
        if not args.no_std_names:
            # Standardizes all image names to start with args.prefix
            prp.rename_images_path(args.out, args.prefix, args.out_extension)
    except Exception as e:
        print("Error: {}".format(e))