From 37748031a6abe48caa46db0c7ea67a3625aa29c6 Mon Sep 17 00:00:00 2001 From: ManiMozaffar Date: Thu, 21 Sep 2023 13:58:36 +0300 Subject: [PATCH 1/4] Fix linting issue --- undetected_chromedriver/cdp.py | 1 - undetected_chromedriver/devtool.py | 17 +++++++---------- undetected_chromedriver/dprocess.py | 4 +--- undetected_chromedriver/patcher.py | 21 +++++++++++++-------- undetected_chromedriver/reactor.py | 1 - undetected_chromedriver/webelement.py | 10 +++++----- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/undetected_chromedriver/cdp.py b/undetected_chromedriver/cdp.py index 32a503c7..3742be91 100644 --- a/undetected_chromedriver/cdp.py +++ b/undetected_chromedriver/cdp.py @@ -7,7 +7,6 @@ import requests import websockets - log = logging.getLogger(__name__) diff --git a/undetected_chromedriver/devtool.py b/undetected_chromedriver/devtool.py index 30e7c08c..073a85bb 100644 --- a/undetected_chromedriver/devtool.py +++ b/undetected_chromedriver/devtool.py @@ -1,16 +1,11 @@ import asyncio -from collections.abc import Mapping -from collections.abc import Sequence -from functools import wraps import logging import threading import time import traceback -from typing import Any -from typing import Awaitable -from typing import Callable -from typing import List -from typing import Optional +from collections.abc import Mapping, Sequence +from functools import wraps +from typing import Any, Awaitable, Callable, List, Optional class Structure(dict): @@ -101,12 +96,14 @@ def function_reached_timeout(): def test(): - import sys, os + import os + import sys sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) - import undetected_chromedriver as uc import threading + import undetected_chromedriver as uc + def collector( driver: uc.Chrome, stop_event: threading.Event, diff --git a/undetected_chromedriver/dprocess.py b/undetected_chromedriver/dprocess.py index e6187fab..5a161daf 100644 --- a/undetected_chromedriver/dprocess.py +++ b/undetected_chromedriver/dprocess.py @@ -4,10 +4,8 @@ import os import platform import signal -from subprocess import PIPE -from subprocess import Popen import sys - +from subprocess import PIPE, Popen CREATE_NEW_PROCESS_GROUP = 0x00000200 DETACHED_PROCESS = 0x00000008 diff --git a/undetected_chromedriver/patcher.py b/undetected_chromedriver/patcher.py index 1b5409d5..8cc0be0d 100644 --- a/undetected_chromedriver/patcher.py +++ b/undetected_chromedriver/patcher.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # this module is part of undetected_chromedriver -from distutils.version import LooseVersion import io import json import logging @@ -14,10 +13,10 @@ import string import sys import time -from urllib.request import urlopen -from urllib.request import urlretrieve import zipfile +from distutils.version import LooseVersion from multiprocessing import Lock +from urllib.request import urlopen, urlretrieve logger = logging.getLogger(__name__) @@ -197,7 +196,6 @@ def driver_binary_in_use(self, path: str = None) -> bool: with open(p, mode="a+b") as fs: exc = [] try: - fs.seek(0, 0) except PermissionError as e: exc.append(e) # since some systems apprently allow seeking @@ -208,7 +206,6 @@ def driver_binary_in_use(self, path: str = None) -> bool: exc.append(e) if exc: - return True return False # ok safe to assume this is in use @@ -260,7 +257,9 @@ def fetch_release_number(self): response = conn.read().decode() major_versions = json.loads(response) - return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"]) + return LooseVersion( + major_versions["milestones"][str(self.version_main)]["version"] + ) def parse_exe_version(self): with io.open(self.executable_path, "rb") as f: @@ -277,10 +276,16 @@ def fetch_package(self): """ zip_name = f"chromedriver_{self.platform_name}.zip" if self.is_old_chromedriver: - download_url = "%s/%s/%s" % (self.url_repo, self.version_full.vstring, zip_name) + download_url = "%s/%s/%s" % ( + self.url_repo, + self.version_full.vstring, + zip_name, + ) else: zip_name = zip_name.replace("_", "-", 1) - download_url = "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/%s/%s/%s" + download_url = ( + "https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/%s/%s/%s" + ) download_url %= (self.version_full.vstring, self.platform_name, zip_name) logger.debug("downloading from %s" % download_url) diff --git a/undetected_chromedriver/reactor.py b/undetected_chromedriver/reactor.py index d52e312e..58b0264b 100644 --- a/undetected_chromedriver/reactor.py +++ b/undetected_chromedriver/reactor.py @@ -6,7 +6,6 @@ import logging import threading - logger = logging.getLogger(__name__) diff --git a/undetected_chromedriver/webelement.py b/undetected_chromedriver/webelement.py index 03d68789..5f87beed 100644 --- a/undetected_chromedriver/webelement.py +++ b/undetected_chromedriver/webelement.py @@ -1,7 +1,7 @@ from typing import List -from selenium.webdriver.common.by import By import selenium.webdriver.remote.webelement +from selenium.webdriver.common.by import By class WebElement(selenium.webdriver.remote.webelement.WebElement): @@ -46,11 +46,11 @@ def attrs(self): if not self._attrs: self._attrs = self._parent.execute_script( """ - var items = {}; - for (index = 0; index < arguments[0].attributes.length; ++index) + var items = {}; + for (index = 0; index < arguments[0].attributes.length; ++index) { - items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value - }; + items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value + }; return items; """, self, From fc065becb0ffcf2a8bfd632bf69cfed7e7ff2a05 Mon Sep 17 00:00:00 2001 From: ManiMozaffar Date: Thu, 21 Sep 2023 14:12:49 +0300 Subject: [PATCH 2/4] Improve code quality with assertions and proper validation --- undetected_chromedriver/__init__.py | 103 +++++++++++++++------------- undetected_chromedriver/patcher.py | 2 +- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/undetected_chromedriver/__init__.py b/undetected_chromedriver/__init__.py index d31055a3..0e70b284 100644 --- a/undetected_chromedriver/__init__.py +++ b/undetected_chromedriver/__init__.py @@ -16,7 +16,6 @@ """ from __future__ import annotations - __version__ = "3.5.3" import json @@ -33,20 +32,17 @@ import selenium.webdriver.chrome.service import selenium.webdriver.chrome.webdriver -from selenium.webdriver.common.by import By import selenium.webdriver.chromium.service import selenium.webdriver.remote.command import selenium.webdriver.remote.webdriver +from selenium.webdriver.common.utils import free_port from .cdp import CDP from .dprocess import start_detached from .options import ChromeOptions -from .patcher import IS_POSIX -from .patcher import Patcher +from .patcher import IS_POSIX, Patcher from .reactor import Reactor -from .webelement import UCWebElement -from .webelement import WebElement - +from .webelement import UCWebElement, WebElement __all__ = ( "Chrome", @@ -270,14 +266,10 @@ def __init__( except AttributeError: pass - options._session = self + options._session = self # type: ignore if not options.debugger_address: - debug_port = ( - port - if port != 0 - else selenium.webdriver.common.service.utils.free_port() - ) + debug_port = port if port != 0 else free_port() debug_host = "127.0.0.1" options.debugger_address = "%s:%d" % (debug_host, debug_port) else: @@ -299,13 +291,13 @@ def __init__( # see if a custom user profile is specified in options for arg in options.arguments: - if any([_ in arg for _ in ("--headless", "headless")]): options.arguments.remove(arg) options.headless = True if "lang" in arg: m = re.search("(?:--)?lang(?:[ =])?(.*)", arg) + assert m is not None try: language = m[1] except IndexError: @@ -314,6 +306,7 @@ def __init__( if "user-data-dir" in arg: m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg) + assert m is not None try: user_data_dir = m[1] logger.debug( @@ -360,7 +353,10 @@ def __init__( try: import locale - language = locale.getdefaultlocale()[0].replace("_", "-") + default_langs = locale.getdefaultlocale() + assert len(default_langs) > 1 + assert default_langs[0] + language = default_langs[0].replace("_", "-") except Exception: pass if not language: @@ -369,21 +365,26 @@ def __init__( options.add_argument("--lang=%s" % language) if not options.binary_location: - options.binary_location = ( + options.binary_location = ( # type: ignore browser_executable_path or find_chrome_executable() ) - if not options.binary_location or not \ - pathlib.Path(options.binary_location).exists(): - raise FileNotFoundError( - "\n---------------------\n" - "Could not determine browser executable." - "\n---------------------\n" - "Make sure your browser is installed in the default location (path).\n" - "If you are sure about the browser executable, you can specify it using\n" - "the `browser_executable_path='{}` parameter.\n\n" - .format("/path/to/browser/executable" if IS_POSIX else "c:/path/to/your/browser.exe") + if ( + not options.binary_location + or not pathlib.Path(options.binary_location).exists() + ): + raise FileNotFoundError( + "\n---------------------\n" + "Could not determine browser executable." + "\n---------------------\n" + "Make sure your browser is installed in the default location (path).\n" + "If you are sure about the browser executable, you can specify it using\n" + "the `browser_executable_path='{}` parameter.\n\n".format( + "/path/to/browser/executable" + if IS_POSIX + else "c:/path/to/your/browser.exe" ) + ) self._delay = 3 @@ -396,15 +397,18 @@ def __init__( options.arguments.extend(["--no-sandbox", "--test-type"]) if headless or options.headless: - #workaround until a better checking is found + # workaround until a better checking is found try: - if self.patcher.version_main < 108: - options.add_argument("--headless=chrome") - elif self.patcher.version_main >= 108: - options.add_argument("--headless=new") - except: - logger.warning("could not detect version_main." - "therefore, we are assuming it is chrome 108 or higher") + if isinstance(self.patcher.version_main, int): + if self.patcher.version_main < 108: + options.add_argument("--headless=chrome") + elif self.patcher.version_main >= 108: + options.add_argument("--headless=new") + except Exception: + logger.warning( + "could not detect version_main." + "therefore, we are assuming it is chrome 108 or higher" + ) options.add_argument("--headless=new") options.add_argument("--window-size=1920,1080") @@ -424,7 +428,7 @@ def __init__( # fix exit_type flag to prevent tab-restore nag try: with open( - os.path.join(user_data_dir, "Default/Preferences"), + os.path.join(user_data_dir, "Default/Preferences"), # type: ignore encoding="latin1", mode="r+", ) as fs: @@ -436,7 +440,7 @@ def __init__( json.dump(config, fs) fs.truncate() # the file might be shorter logger.debug("fixed exit_type flag") - except Exception as e: + except Exception: logger.debug("did not find a bad exit_type flag ") self.options = options @@ -458,14 +462,13 @@ def __init__( ) self.browser_pid = browser.pid - service = selenium.webdriver.chromium.service.ChromiumService( self.patcher.executable_path ) super(Chrome, self).__init__( - service=service, - options=options, + service=service, # type: ignore + options=options, # type: ignore keep_alive=keep_alive, ) @@ -739,6 +742,7 @@ def find_elements_recursive(self, by, value): value: str Returns: Generator[webelement.WebElement] """ + def search_frame(f=None): if not f: # ensure we are on main content frame @@ -754,7 +758,7 @@ def search_frame(f=None): for elem in search_frame(): yield elem # get iframes - frames = self.find_elements('css selector', 'iframe') + frames = self.find_elements("css selector", "iframe") # search per frame for f in frames: @@ -768,10 +772,12 @@ def quit(self): except (AttributeError, RuntimeError, OSError): pass try: - self.reactor.event.set() - logger.debug("shutting down reactor") + if self.reactor: + self.reactor.event.set() + logger.debug("shutting down reactor") except AttributeError: pass + try: os.kill(self.browser_pid, 15) logger.debug("gracefully closed browser") @@ -781,6 +787,7 @@ def quit(self): hasattr(self, "keep_user_data_dir") and hasattr(self, "user_data_dir") and not self.keep_user_data_dir + and self.user_data_dir ): for _ in range(5): try: @@ -866,7 +873,9 @@ def find_chrome_executable(): """ candidates = set() if IS_POSIX: - for item in os.environ.get("PATH").split(os.pathsep): + path = os.environ.get("PATH") + assert path is not None + for item in path.split(os.pathsep): for subitem in ( "google-chrome", "chromium", @@ -888,12 +897,10 @@ def find_chrome_executable(): ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA", "PROGRAMW6432"), ): if item is not None: - for subitem in ( - "Google/Chrome/Application", - ): + for subitem in ("Google/Chrome/Application",): candidates.add(os.sep.join((item, subitem, "chrome.exe"))) for candidate in candidates: - logger.debug('checking if %s exists and is executable' % candidate) + logger.debug("checking if %s exists and is executable" % candidate) if os.path.exists(candidate) and os.access(candidate, os.X_OK): - logger.debug('found! using %s' % candidate) + logger.debug("found! using %s" % candidate) return os.path.normpath(candidate) diff --git a/undetected_chromedriver/patcher.py b/undetected_chromedriver/patcher.py index 8cc0be0d..b884a9ac 100644 --- a/undetected_chromedriver/patcher.py +++ b/undetected_chromedriver/patcher.py @@ -44,7 +44,7 @@ def __init__( self, executable_path=None, force=False, - version_main: int = 0, + version_main: None | int = 0, user_multi_procs=False, ): """ From 28a12652a3418dc8abd80d75d100271a5e88c7ce Mon Sep 17 00:00:00 2001 From: ManiMozaffar Date: Thu, 21 Sep 2023 14:13:04 +0300 Subject: [PATCH 3/4] Fix connection leak issue --- undetected_chromedriver/reactor.py | 50 +++++++++++++++++------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/undetected_chromedriver/reactor.py b/undetected_chromedriver/reactor.py index 58b0264b..3cc73bfd 100644 --- a/undetected_chromedriver/reactor.py +++ b/undetected_chromedriver/reactor.py @@ -1,27 +1,26 @@ -#!/usr/bin/env python3 -# this module is part of undetected_chromedriver - import asyncio import json import logging import threading +import time + +from selenium import webdriver logger = logging.getLogger(__name__) class Reactor(threading.Thread): - def __init__(self, driver: "Chrome"): + def __init__(self, driver: webdriver.Chrome): super().__init__() - self.driver = driver self.loop = asyncio.new_event_loop() - + self.paused = False self.lock = threading.Lock() self.event = threading.Event() self.daemon = True self.handlers = {} - def add_event_handler(self, method_name, callback: callable): + def add_event_handler(self, method_name, callback): """ Parameters @@ -39,9 +38,16 @@ def add_event_handler(self, method_name, callback: callable): with self.lock: self.handlers[method_name.lower()] = callback - @property - def running(self): - return not self.event.is_set() + def terminate(self, timeout=10): + self.event.set() + start_time = time.time() + while not self.paused: + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + break + time.sleep(0.1) + self.loop.close() + return None def run(self): try: @@ -58,12 +64,12 @@ async def _wait_service_started(self): and getattr(self.driver.service, "process", None) and self.driver.service.process.poll() ): - await asyncio.sleep(self.driver._delay or 0.25) + await asyncio.sleep(self.driver._delay or 0.25) # type: ignore else: break async def listen(self): - while self.running: + while not self.event.is_set(): await self._wait_service_started() await asyncio.sleep(1) @@ -74,9 +80,11 @@ async def listen(self): for entry in log_entries: try: obj_serialized: str = entry.get("message") - obj = json.loads(obj_serialized) + obj: dict[str, dict] = json.loads(obj_serialized) message = obj.get("message") - method = message.get("method") + assert message is not None + method: str | None = message.get("method") + assert isinstance(method, str) if "*" in self.handlers: await self.loop.run_in_executor( @@ -86,13 +94,11 @@ async def listen(self): await self.loop.run_in_executor( None, self.handlers[method.lower()], message ) + except Exception as error: + logging.debug("exception ignored :", error) + raise error from None - # print(type(message), message) - except Exception as e: - raise e from None + except Exception as error: + logging.debug("exception ignored :", error) - except Exception as e: - if "invalid session id" in str(e): - pass - else: - logging.debug("exception ignored :", e) + self.paused = True From dea745265202bbfdcb8026b982d30a0fe467c352 Mon Sep 17 00:00:00 2001 From: ManiMozaffar Date: Thu, 21 Sep 2023 14:16:13 +0300 Subject: [PATCH 4/4] Add support for python <3.10 --- undetected_chromedriver/patcher.py | 3 ++- undetected_chromedriver/reactor.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/undetected_chromedriver/patcher.py b/undetected_chromedriver/patcher.py index b884a9ac..343494ef 100644 --- a/undetected_chromedriver/patcher.py +++ b/undetected_chromedriver/patcher.py @@ -16,6 +16,7 @@ import zipfile from distutils.version import LooseVersion from multiprocessing import Lock +from typing import Optional from urllib.request import urlopen, urlretrieve logger = logging.getLogger(__name__) @@ -44,7 +45,7 @@ def __init__( self, executable_path=None, force=False, - version_main: None | int = 0, + version_main: Optional[int] = 0, user_multi_procs=False, ): """ diff --git a/undetected_chromedriver/reactor.py b/undetected_chromedriver/reactor.py index 3cc73bfd..c8b389bb 100644 --- a/undetected_chromedriver/reactor.py +++ b/undetected_chromedriver/reactor.py @@ -3,6 +3,7 @@ import logging import threading import time +from typing import Dict, Optional from selenium import webdriver @@ -80,10 +81,10 @@ async def listen(self): for entry in log_entries: try: obj_serialized: str = entry.get("message") - obj: dict[str, dict] = json.loads(obj_serialized) + obj: Dict[str, dict] = json.loads(obj_serialized) message = obj.get("message") assert message is not None - method: str | None = message.get("method") + method: Optional[str] = message.get("method") assert isinstance(method, str) if "*" in self.handlers: