Merge pull request #35 from arrrlo/feature/multithreading
Feature/multithreading
arrrlo authored Jun 13, 2019
2 parents 37e50d7 + 64e7012 commit bfa05d0
Showing 8 changed files with 225 additions and 60 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog

## 1.0.0

### Added in 1.0.0
- multithreaded image downloading
- download progress bars
- external progress bar insertion

## 0.3.8

### Fixed in 0.3.8
16 changes: 15 additions & 1 deletion README.md
@@ -84,7 +84,6 @@ _search_params = {
    'fileType': 'jpg|gif|png',
    'imgType': 'clipart|face|lineart|news|photo',
    'imgSize': 'huge|icon|large|medium|small|xlarge|xxlarge',
    'searchType': 'image',
    'imgDominantColor': 'black|blue|brown|gray|green|pink|purple|teal|white|yellow'
}

Expand All @@ -104,6 +103,21 @@ for image in gis.results():
image.resize(500, 500)
```

## Inserting custom progressbar function

```python
from google_images_search import GoogleImagesSearch

def my_progressbar(url, progress):
    print(url + ' ' + str(progress) + '%')

gis = GoogleImagesSearch(
    'your_dev_api_key', 'your_project_cx', progressbar_fn=my_progressbar
)

...
```
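
The `...` above stands for the usual search call. As a hedged, self-contained sketch of how the callback might be wired end-to-end (the query values below are illustrative and not part of this changeset):

```python
from google_images_search import GoogleImagesSearch

def my_progressbar(url, progress):
    # progress arrives as an integer percentage between 0 and 100
    print(url + ' ' + str(progress) + '%')

gis = GoogleImagesSearch(
    'your_dev_api_key', 'your_project_cx', progressbar_fn=my_progressbar
)

# illustrative search parameters; any key documented above may be used
_search_params = {'q': 'puppies', 'num': 3}

# my_progressbar is called with (url, percentage) while each image downloads
gis.search(search_params=_search_params, path_to_dir='/path/to/downloads')

for image in gis.results():
    print(image.url, image.path)
```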

## Saving to a BytesIO object

```python
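# (the body of this example is collapsed in the diff view)
#
# A hedged sketch of what saving into a BytesIO object can look like, based on
# the GSImage.get_raw_data()/copy_to() methods shown in fetch_resize_save.py
# below; the search parameters are illustrative:
from io import BytesIO
from google_images_search import GoogleImagesSearch

gis = GoogleImagesSearch('your_dev_api_key', 'your_project_cx')
gis.search({'q': 'puppies', 'num': 1})

for image in gis.results():
    my_bytes_io = BytesIO()
    # copy_to() writes the raw image bytes into the file-like object
    image.copy_to(my_bytes_io)
    my_bytes_io.seek(0)
```
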
15 changes: 4 additions & 11 deletions google_images_search/cli.py
@@ -1,6 +1,4 @@
import click
from termcolor import cprint
from pyfiglet import figlet_format

from .fetch_resize_save import FetchResizeSave
from .google_api import GoogleBackendException
@@ -12,7 +10,9 @@
@click.option('-c', '--custom_search_cx', help='Custom Search CX')
def cli(ctx, developer_key, custom_search_cx):
ctx.obj = {
'object': FetchResizeSave(developer_key, custom_search_cx)
'object': FetchResizeSave(
developer_key, custom_search_cx, progress=True
)
}


@@ -57,14 +57,10 @@ def search(ctx, query, num, safe, filetype, imagetype,

click.clear()

cprint(figlet_format('Google Images Search', width=120), 'red')

click.echo('-'*120)

try:
ctx.obj['object'].search(search_params, download_path, width, height)

for _, image in enumerate(ctx.obj['object'].results()):
for image in ctx.obj['object'].results():
click.echo(image.url)
if image.path:
click.secho(image.path, fg='blue')
@@ -78,6 +74,3 @@ def search(ctx, query, num, safe, filetype, imagetype,
click.secho('Error occurred trying to fetch '
'images from Google. Please try again.', fg='red')
return

click.echo('-'*120)
click.echo()
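
The hunks above show the CLI now passing `progress=True` to `FetchResizeSave`, replacing the old figlet banner with per-image curses progress bars. A hedged sketch of the equivalent flag when using the library directly, assuming `GoogleImagesSearch` accepts the same constructor arguments as `FetchResizeSave`:

```python
from google_images_search import GoogleImagesSearch

# progress=True turns on the built-in curses progress bars;
# pass progressbar_fn=<callable> instead to route progress to your own code
gis = GoogleImagesSearch('your_dev_api_key', 'your_project_cx', progress=True)

gis.search({'q': 'puppies', 'num': 2}, '/path/to/downloads')
```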
149 changes: 115 additions & 34 deletions google_images_search/fetch_resize_save.py
@@ -1,6 +1,7 @@
import os
import shutil
import curses
import requests
import threading
from PIL import Image
from resizeimage import resizeimage

@@ -10,10 +11,31 @@
class FetchResizeSave(object):
"""Class with resizing and downloading logic"""

def __init__(self, developer_key, custom_search_cx):
self._google_custom_search = GoogleCustomSearch(developer_key,
custom_search_cx)
self._search_resut = []
def __init__(self, developer_key, custom_search_cx,
progressbar_fn=None, progress=False):

# initialise google api
self._google_custom_search = GoogleCustomSearch(
developer_key, custom_search_cx, self)

self._search_result = list()

self._stdscr = None
self._progress = False
self._chunk_sizes = dict()
self._terminal_lines = dict()
self._download_progress = dict()
self._report_progress = progressbar_fn

if progressbar_fn:
# user-inserted progressbar fn
self._progress = True
else:
if progress:
# initialise internal progressbar
self._progress = True
self._stdscr = curses.initscr()
self._report_progress = self.__report_progress

def search(self, search_params, path_to_dir=False, width=None,
height=None, cache_discovery=True):
@@ -27,24 +49,70 @@ def search(self, search_params, path_to_dir=False, width=None,
:return: None
"""

for url in self._google_custom_search.search(search_params,
cache_discovery):
i = 0
threads = list()
for url in self._google_custom_search.search(
search_params, cache_discovery
):
# initialise image object
image = GSImage(self)
image.url = url

if path_to_dir:
image.download(path_to_dir)
if width and height:
image.resize(width, height)
# set thread safe variables
self._download_progress[url] = 0
self._terminal_lines[url] = i
i += 2

# set thread with function and arguments
thread = threading.Thread(
target=self._download_and_resize,
args=(path_to_dir, image, width, height)
)

# start thread
thread.start()

# register thread
threads.append(thread)

self._search_resut.append(image)
# wait for all threads to end here
for thread in threads:
thread.join()

if self._progress:
if self._stdscr:
curses.endwin()

def set_chunk_size(self, url, content_size):
"""Set images chunk size according to its size
:param url: image url
:param content_size: image size
:return: None
"""

self._chunk_sizes[url] = int(int(content_size) / 100) + 1

def _download_and_resize(self, path_to_dir, image, width, height):
"""Method used for threading
:param path_to_dir: path to download dir
:param image: image object
:param width: crop width
:param height: crop height
:return: None
"""

if path_to_dir:
image.download(path_to_dir)
if width and height:
image.resize(width, height)
self._search_result.append(image)

def results(self):
"""Returns objects of downloaded images
:return: list
"""

return self._search_resut
return self._search_result

def download(self, url, path_to_dir):
"""Downloads image from url to path dir
@@ -57,40 +125,37 @@ def download(self, url, path_to_dir):
if not os.path.exists(path_to_dir):
os.makedirs(path_to_dir)

raw_data = self.__class__.get_raw_data(url)

raw_filename = url.split('/')[-1].split('?')[0]
basename, ext = os.path.splitext(raw_filename)
filename = "".join(x for x in basename if x.isalnum()) + ext

path_to_image = os.path.join(path_to_dir, filename)

with open(path_to_image, 'wb') as f:
self.__class__.copy_to(raw_data, f)
with open(path_to_image, 'wb+') as f:
for chunk in self.get_raw_data(url):
f.write(chunk)

return path_to_image

@staticmethod
def get_raw_data(url):
"""Takes data from image url into a variable
def get_raw_data(self, url):
"""Generator method for downloading images in chunks
:param url: url to image
:return: raw image data
"""

req = requests.get(url, stream=True)
req.raw.decode_content = True
return req.raw
with requests.get(url, stream=True) as req:
for chunk in req.iter_content(chunk_size=self._chunk_sizes[url]):

@staticmethod
def copy_to(raw_data, obj):
"""
Copy raw image data to another object, preferably BytesIO
:param raw_data: raw image data
:param obj: BytesIO object
:return: None
"""
# filter out keep-alive new chunks
if chunk:

# report progress
if self._progress:
self._download_progress[url] += 1
if self._download_progress[url] <= 100:
self._report_progress(url, self._download_progress[url])

shutil.copyfileobj(raw_data, obj)
yield chunk

@staticmethod
def resize(path_to_image, width, height):
@@ -107,6 +172,22 @@ def resize(path_to_image, width, height):
img.save(path_to_image, img.format)
fd_img.close()

def __report_progress(self, url, progress):
"""Prints a progress bar in terminal
:param url:
:param progress:
:return:
"""

self._stdscr.addstr(
self._terminal_lines[url], 0, "Downloading file: {0}".format(url)
)
self._stdscr.addstr(
self._terminal_lines[url] + 1, 0,
"Progress: [{1:100}] {0}%".format(progress, "#" * progress)
)
self._stdscr.refresh()


class GSImage(object):
"""Class for handling one image"""
@@ -166,7 +247,7 @@ def get_raw_data(self):
:return: raw data
"""

return self._fetch_resize_save.__class__.get_raw_data(self._url)
return b''.join(list(self._fetch_resize_save.get_raw_data(self._url)))

def copy_to(self, obj, raw_data=None):
"""Copies raw image data to another object, preferably BytesIO
@@ -178,7 +259,7 @@ def copy_to(self, obj, raw_data=None):
if not raw_data:
raw_data = self.get_raw_data()

self._fetch_resize_save.__class__.copy_to(raw_data, obj)
obj.write(raw_data)

def resize(self, width, height):
"""Resize the image
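
Taken together, the new `search()` spawns one thread per image URL, each thread streams its file through the chunked `get_raw_data()` generator, and every chunk advances a per-URL counter that is reported either to a user-supplied callback or to the curses screen. A minimal standalone sketch of that download pattern, independent of this library (URLs and helper names here are illustrative):

```python
import threading
import requests

progress = {}  # per-URL percentage, updated from the download threads

def download(url, dest_path):
    # size the chunks from Content-Length so one chunk is roughly 1% of the file
    content_length = int(requests.head(url, timeout=5).headers['Content-Length'])
    chunk_size = content_length // 100 + 1

    with requests.get(url, stream=True) as resp, open(dest_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
                progress[url] = min(progress.get(url, 0) + 1, 100)

urls = ['https://example.com/a.jpg', 'https://example.com/b.jpg']
threads = [
    threading.Thread(target=download, args=(url, url.split('/')[-1]))
    for url in urls
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
```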
34 changes: 27 additions & 7 deletions google_images_search/google_api.py
@@ -7,14 +7,16 @@ class GoogleCustomSearch(object):
"""Wrapper class for Google images search api"""

def __init__(self, developer_key=None,
custom_search_cx=None):
custom_search_cx=None,
fethch_resize_save=None):

self._developer_key = developer_key or \
os.environ.get('GCS_DEVELOPER_KEY')
self._custom_search_cx = custom_search_cx or \
os.environ.get('GCS_CX')

self._google_build = None
self._fethch_resize_save = fethch_resize_save

self._search_params_keys = {
'q': None,
@@ -54,8 +56,10 @@ def _search_params(self, params):
for key, value in self._search_params_keys.items():
params_value = params.get(key)
if params_value:
# take user defined param value if defined
search_params[key] = params_value
elif value:
# take default param value if defined
search_params[key] = value

return search_params
@@ -70,18 +74,34 @@ def search(self, params, cache_discovery=True):

search_params = self._search_params(params)

try:
res = self._query_google_api(search_params, cache_discovery)
except:
raise GoogleBackendException()
res = self._query_google_api(search_params, cache_discovery)

for image in res.get('items'):
try:
check = requests.get(image['link'], timeout=5)
if check.status_code == 200:
response = requests.head(image['link'], timeout=5)
content_length = response.headers.get('Content-Length')

# check if the url is valid
if response.status_code == 200 and \
'image' in response.headers['Content-Type'] and \
content_length:

# calculate download chunk size based on image size
self._fethch_resize_save.set_chunk_size(
image['link'], content_length
)

# if everything is ok, yield image url back
yield image['link']

else:
# validation failed, go with another image
continue

except requests.exceptions.ConnectTimeout:
pass
except requests.exceptions.SSLError:
pass


class GoogleBackendException(Exception):
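
The `HEAD` request above does double duty: it validates each candidate URL (status 200, an image `Content-Type`, a known `Content-Length`) without downloading the body, and it feeds `Content-Length` into `set_chunk_size()`, whose formula `int(content_size / 100) + 1` splits the file into roughly 100 chunks so each chunk corresponds to about 1% of progress. A quick worked example of that arithmetic:

```python
# for a hypothetical 250,000-byte image:
content_length = 250000
chunk_size = int(content_length / 100) + 1       # 2501 bytes per chunk
num_chunks = -(-content_length // chunk_size)    # ceiling division -> 100 chunks
# so each chunk written advances the per-URL progress counter by roughly 1%
```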
4 changes: 2 additions & 2 deletions setup.py
@@ -8,7 +8,7 @@ def readme():

setup(
name='Google Images Search',
version="0.3.8",
version="1.0.0",

description='Search for image using Google Custom Search API and resize & crop the image afterwords',
long_description=readme(),
@@ -21,7 +21,7 @@ def readme():
author_email='[email protected]',

classifiers=[
'Development Status :: 4 - Beta',
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Topic :: Software Development :: Build Tools',
'License :: OSI Approved :: MIT License',
