From 2656defb621dffc8cae193ffa42fa41faad9dcac Mon Sep 17 00:00:00 2001 From: Panos Mavrogiorgos Date: Sat, 9 Dec 2023 22:24:07 +0200 Subject: [PATCH] scraper: Allow specifying the number of threads/processes --- observer/ioc/scraper.py | 6 ++++-- poetry.lock | 8 ++++---- pyproject.toml | 2 +- requirements/requirements-dev.txt | 2 +- requirements/requirements.txt | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/observer/ioc/scraper.py b/observer/ioc/scraper.py index 4767b01..90aa87e 100644 --- a/observer/ioc/scraper.py +++ b/observer/ioc/scraper.py @@ -103,6 +103,8 @@ def scrape_ioc_station( start_date: pd.Timestamp, end_date: pd.Timestamp, rate_limit: multifutures.RateLimit | None = None, + n_threads: int = max(10, multifutures.MAX_AVAILABLE_PROCESSES), + n_processes: int = min(128, multifutures.MAX_AVAILABLE_PROCESSES), ) -> pd.DataFrame: """ Return a DataFrame with all the data of `ioc_code` from `start_date` to `end_date` @@ -123,7 +125,7 @@ def scrape_ioc_station( dict(ioc_code=ioc_code, url=url, client=client, rate_limit=rate_limit) for url in urls ] logger.debug("%s: Starting data retrieval", ioc_code) - results = multifutures.multithread(fetch_url, multithread_kwargs, check=True) + results = multifutures.multithread(fetch_url, multithread_kwargs, check=True, n_workers=n_threads) logger.debug("%s: Finished data retrieval", ioc_code) # Parse the json files using pandas @@ -145,7 +147,7 @@ def scrape_ioc_station( # This is a CPU heavy process, so let's use multiprocess logger.debug("%s: Starting conversion to pandas", ioc_code) - results = multifutures.multiprocess(parse_json, multiprocess_kwargs, check=False) + results = multifutures.multiprocess(parse_json, multiprocess_kwargs, check=False, n_workers=n_processes) multifutures.check_results(results) dataframes: list[pd.DataFrame] = [r.result for r in results] if dataframes: diff --git a/poetry.lock b/poetry.lock index 529f8fe..11996b5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2379,13 +2379,13 @@ files = [ [[package]] name = "multifutures" -version = "0.1.1" +version = "0.1.2" description = "" optional = false python-versions = ">=3.10,<4.0" files = [ - {file = "multifutures-0.1.1-py3-none-any.whl", hash = "sha256:91b0b8b7ee90fd6de3695980f9030a14ace1ccfc12cc7d6bc7b941197a387536"}, - {file = "multifutures-0.1.1.tar.gz", hash = "sha256:dd661b65a3ca34f2a8c2d15fbc41b8e5da0c79f9c16d3a9852f3a013f0b17f52"}, + {file = "multifutures-0.1.2-py3-none-any.whl", hash = "sha256:6130f7c3533adf8dfe46ab2ca3511045b2e021f00f424bb02d22500af50588c2"}, + {file = "multifutures-0.1.2.tar.gz", hash = "sha256:1b00e31d6693a2167f53fa070907267c65d867b5eab1331f4e24ef4fce7c2157"}, ] [package.dependencies] @@ -4314,4 +4314,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "b0c23910687b30296d83eaa97d95826728853ca89bf0175f4e1275ad93647d6e" +content-hash = "eeb13ee1d7aa45e7acaa4a579e013ee09da1a5881b8c72842f5d09acac3a53b5" diff --git a/pyproject.toml b/pyproject.toml index da57c8b..b1c3db1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ azure-storage-blob = "*" dask = {version = "*", extras = ["complete"]} fastparquet = "*" httpx = "*" -multifutures = ">0.1.0" +multifutures = ">=0.1.2" pandas = "*" pydantic-settings = "*" scipy = ">1.8" diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 59cb823..4bccf7b 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -75,7 +75,7 @@ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0" msal-extensions==1.1.0 ; python_version >= "3.10" and python_version < "4.0" msal==1.26.0 ; python_version >= "3.10" and python_version < "4.0" multidict==6.0.4 ; python_version >= "3.10" and python_version < "4.0" -multifutures==0.1.1 ; python_version >= "3.10" and python_version < "4.0" +multifutures==0.1.2 ; python_version >= "3.10" and python_version < "4.0" mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0" mypy==1.7.1 ; python_version >= "3.10" and python_version < "4.0" nest-asyncio==1.5.8 ; python_version >= "3.10" and python_version < "4.0" diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 21bc6e8..76c80ae 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -60,7 +60,7 @@ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0" msal-extensions==1.1.0 ; python_version >= "3.10" and python_version < "4.0" msal==1.26.0 ; python_version >= "3.10" and python_version < "4.0" multidict==6.0.4 ; python_version >= "3.10" and python_version < "4.0" -multifutures==0.1.1 ; python_version >= "3.10" and python_version < "4.0" +multifutures==0.1.2 ; python_version >= "3.10" and python_version < "4.0" netcdf4==1.6.5 ; python_version >= "3.10" and python_version < "4.0" numba==0.58.1 ; python_version >= "3.10" and python_version < "4.0" numbagg==0.6.7 ; python_version >= "3.10" and python_version < "4.0"