# Flickr Download, by Jeff Heaton (http://www.heatonresearch.com)
# https://github.com/jeffheaton/pyimgdata
# Copyright 2020, MIT License
"""Download Flickr search results into a local folder.

Settings are read from ``config_flickr.ini`` located next to this script.
The ``[Download]`` section names the target directory with the ``folder``
key (for example ``folder = scrapped_images``), which replaces the former
``path`` key.  A relative value is resolved against the script's directory;
an absolute value is used unchanged.
"""
import configparser
import csv
import logging
import logging.config
import os
import threading
import time
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from hashlib import sha256
from io import BytesIO
from multiprocessing import cpu_count

import flickrapi
import requests
from PIL import Image

# https://code.flickr.net/2008/08/19/standard-photos-response-apis-for-civilized-age/


def hms_string(sec_elapsed):
    """Nicely formatted time string"""
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"


def is_true(str):  # parameter name kept for callers; note it shadows the builtin
    """Return True when a config value starts with "t"/"T" (e.g. "true").

    An empty value is treated as False instead of raising IndexError.
    """
    return str.strip().lower().startswith("t")


def handle_path(dirname):
    """Resolve *dirname* against the directory that contains this script.

    ``os.path.join`` discards the base when *dirname* is absolute, so absolute
    paths pass through unchanged.
    """
    base_path = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base_path, dirname)


class FlickrImageDownload:
    """Search Flickr and save matching, license-filtered images to disk."""

    # Seconds to wait on one HTTP request before treating it as failed.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Load configuration, set up logging and create the Flickr client."""
        self.config = configparser.ConfigParser()
        self.config.read(handle_path("config_flickr.ini"))
        logging.config.fileConfig(handle_path("logging.properties"))

        download = self.config["Download"]
        process = self.config["Process"]

        self.config_path = handle_path(download["folder"])
        logging.info(f"Loading images into {self.config_path}")
        os.makedirs(self.config_path, exist_ok=True)

        self.config_prefix = download["prefix"]
        self.config_search = download["search"]
        self.config_update_minutes = int(download["update_minutes"])
        self.config_max_download_count = int(download["max_download"])
        # Strip each entry so "1, 2" yields integers rather than " 2" strings
        # that could never match a numeric license id.
        self.config_license_allowed = [
            int(entry) if entry.isdigit() else entry
            for entry in (raw.strip() for raw in download["license"].split(","))
        ]
        self.config_format = process["image_format"]
        self.config_process = is_true(process["process"])
        self.config_crop_square = is_true(process["crop_square"])
        self.config_scale_width = int(process["scale_width"])
        self.config_scale_height = int(process["scale_height"])
        self.config_min_width = int(process["min_width"])
        self.config_min_height = int(process["min_height"])
        # Optional: name of a CSV file listing every url -> file mapping.
        self.config_sources_file = download.get("sources_file")

        self.flickr = flickrapi.FlickrAPI(
            self.config["FLICKR"]["id"], self.config["FLICKR"]["secret"], cache=True
        )

        # Worker threads share the counters below; one lock guards them all.
        self._lock = threading.Lock()
        # Set once the download limit is hit (or on Ctrl-C) so that both the
        # producer loop and the workers stop picking up new photos.
        self._stop = threading.Event()

    def reset_counts(self):
        """Zero the per-run statistics and restart the clock."""
        # NOTE(review): two lines of this method fell between diff hunks; the
        # attribute list is reconstructed from what the other methods read.
        # Confirm against the full file.
        self.download_count = 0
        self.start_time = time.time()
        self.last_update = 0
        self.since_last_update = 0
        self.error_count = 0
        self.cached = 0
        self.sources = []
        self._stop.clear()

    def load_image(self, url):
        """Fetch *url* and decode it as an image.

        Returns:
            ``(image, sha256_hex_of_payload)`` on success, ``(None, None)`` on
            any download or decode failure (the failure is logged).
        """
        # NOTE(review): the statements between the request and the exception
        # handlers were outside the visible diff hunks; this body is rebuilt
        # from the call site (a 2-tuple is unpacked) and the module imports.
        # Confirm against the full file.
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            payload = response.content
            image = Image.open(BytesIO(payload))
            image.load()  # decode now so corrupt data fails here, not later
            return image, sha256(payload).hexdigest()
        except Exception:
            # Best effort per image: log and carry on.  KeyboardInterrupt is
            # not an Exception subclass, so Ctrl-C still propagates.
            logging.warning(
                f"Unexpected exception while downloading image: {url}", exc_info=True
            )
            return None, None

    def obtain_photo(self, photo):
        """Return the decoded image for *photo*, or None if skipped/failed.

        A photo is skipped when it has no ``url_c``, no ``license`` attribute,
        or a license that is not in the configured allow-list.  Only a failed
        download of an allowed photo counts as an error.
        """
        url = photo.get("url_c")
        raw_license = photo.get("license")
        if not url or raw_license is None:
            return None

        license_id = int(raw_license) if raw_license.isdigit() else raw_license
        if license_id not in self.config_license_allowed:
            return None

        image, _ = self.load_image(url)
        if image is None:
            with self._lock:
                self.error_count += 1
        return image

    def check_to_keep_photo(self, url, image):
        """Return the target path for a new image, or None if already on disk.

        The file name is derived from the SHA-256 of the decoded pixel data,
        so identical images map to the same file.  Every url/path pair is
        recorded in ``self.sources`` regardless of the outcome.
        """
        digest = sha256(image.tobytes()).hexdigest()
        path = os.path.join(
            self.config_path, f"{self.config_prefix}-{digest}.{self.config_format}"
        )
        with self._lock:
            self.sources.append([url, path])
            if os.path.exists(path):
                self.cached += 1
                logging.debug(f"Image already exists: {url}")
                return None
            self.download_count += 1
        logging.debug(f"Downloaded: {url} to {path}")
        return path

    def process_image(self, image, path):
        """Crop, scale and convert *image* to RGB according to the config.

        *path* is only used in log messages.  Returns the processed image.
        """
        width, height = image.size

        if self.config_process:
            # Crop the image, centered
            if self.config_crop_square:
                side = min(width, height)
                left = (width - side) // 2
                top = (height - side) // 2
                image = image.crop((left, top, left + side, top + side))

            # Scale the image.  Both dimensions must be positive or resize()
            # raises; LANCZOS is the filter formerly exposed as ANTIALIAS,
            # which was removed in Pillow 10.
            if self.config_scale_width > 0 and self.config_scale_height > 0:
                image = image.resize(
                    (self.config_scale_width, self.config_scale_height),
                    Image.LANCZOS,
                )

        # Convert to full color (no grayscale, no transparent)
        if image.mode != "RGB":
            logging.debug(f"Grayscale to RGB: {path}")
            rgbimg = Image.new("RGB", image.size)
            rgbimg.paste(image)
            image = rgbimg

        return image

    def track_progress(self):
        """Log periodic statistics; return True once the limit is reached."""
        with self._lock:
            elapsed_min = int((time.time() - self.start_time) / 60)
            self.since_last_update = elapsed_min - self.last_update
            if self.since_last_update >= self.config_update_minutes:
                logging.info(
                    f"Update for {elapsed_min}: images={self.download_count:,}; errors={self.error_count:,}; cached={self.cached:,}"
                )
                self.last_update = elapsed_min

            # ">=" so that max_download images are fetched, not one more.
            limit_reached = self.download_count >= self.config_max_download_count
            if limit_reached and not self._stop.is_set():
                self._stop.set()
                logging.info("Reached max download count")
        return limit_reached

    def write_sources(self):
        """Write the url -> file CSV when a sources file is configured."""
        if self.config_sources_file:
            logging.info("Writing sources file.")
            filename = os.path.join(self.config_path, self.config_sources_file)
            with open(filename, "w", newline="", encoding="utf-8") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(["url", "file"])
                csvwriter.writerows(self.sources)

    def get_photos(self, photo):
        """Download, filter, process and save one photo (runs in a worker).

        Returns True when the download limit has been reached, else False.
        """
        if self._stop.is_set():
            return True

        url = photo.get("url_c")
        img = self.obtain_photo(photo)
        if img is not None:
            path = self.check_to_keep_photo(url, img)
            if path:
                self.process_image(img, path).save(path)

        return self.track_progress()

    def _log_failures(self, futures):
        """Log (and count) exceptions raised inside finished worker tasks."""
        for future in futures:
            error = future.exception()
            if error is not None:
                with self._lock:
                    self.error_count += 1
                logging.warning("Photo task failed", exc_info=error)

    def run(self):
        """Walk the search results and download them with a thread pool."""
        logging.info("Starting...")
        self.reset_counts()
        photos = self.flickr.walk(
            text=self.config_search,
            tag_mode="all",
            tags=self.config_search,
            extras="url_c,license",
            per_page=100,
            sort="relevance",
            # license='0'
        )

        workers = cpu_count()
        # Executor.map() would drain the whole (potentially huge) result
        # generator before any limit could take effect.  Submitting with a
        # bounded number of tasks in flight keeps the walk lazy and lets the
        # stop event end it; in-flight tasks may overshoot the limit slightly.
        max_in_flight = workers * 2
        in_flight = set()
        with ThreadPoolExecutor(max_workers=workers) as executor:
            try:
                for photo in photos:
                    if self._stop.is_set():
                        break
                    in_flight.add(executor.submit(self.get_photos, photo))
                    if len(in_flight) >= max_in_flight:
                        done, in_flight = wait(
                            in_flight, return_when=FIRST_COMPLETED
                        )
                        self._log_failures(done)
            except KeyboardInterrupt:
                logging.info("Keyboard interrupt, stopping")
                self._stop.set()

            done, _ = wait(in_flight)
            self._log_failures(done)

        self.write_sources()
        elapsed_time = time.time() - self.start_time
        logging.info(f"Complete, elapsed time: {hms_string(elapsed_time)}")


if __name__ == "__main__":
    task = FlickrImageDownload()
    task.run()