Skip to content

Commit

Permalink
scraper: Allow specifying the number of threads/processes
Browse files Browse the repository at this point in the history
  • Loading branch information
pmav99 committed Dec 9, 2023
1 parent 704ccd8 commit 2656def
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 9 deletions.
6 changes: 4 additions & 2 deletions observer/ioc/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ def scrape_ioc_station(
start_date: pd.Timestamp,
end_date: pd.Timestamp,
rate_limit: multifutures.RateLimit | None = None,
n_threads: int = max(10, multifutures.MAX_AVAILABLE_PROCESSES),
n_processes: int = min(128, multifutures.MAX_AVAILABLE_PROCESSES),
) -> pd.DataFrame:
"""
Return a DataFrame with all the data of `ioc_code` from `start_date` to `end_date`
Expand All @@ -123,7 +125,7 @@ def scrape_ioc_station(
dict(ioc_code=ioc_code, url=url, client=client, rate_limit=rate_limit) for url in urls
]
logger.debug("%s: Starting data retrieval", ioc_code)
results = multifutures.multithread(fetch_url, multithread_kwargs, check=True)
results = multifutures.multithread(fetch_url, multithread_kwargs, check=True, n_workers=n_threads)
logger.debug("%s: Finished data retrieval", ioc_code)

# Parse the json files using pandas
Expand All @@ -145,7 +147,7 @@ def scrape_ioc_station(

# This is a CPU heavy process, so let's use multiprocess
logger.debug("%s: Starting conversion to pandas", ioc_code)
results = multifutures.multiprocess(parse_json, multiprocess_kwargs, check=False)
results = multifutures.multiprocess(parse_json, multiprocess_kwargs, check=False, n_workers=n_processes)
multifutures.check_results(results)
dataframes: list[pd.DataFrame] = [r.result for r in results]
if dataframes:
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ azure-storage-blob = "*"
dask = {version = "*", extras = ["complete"]}
fastparquet = "*"
httpx = "*"
multifutures = ">0.1.0"
multifutures = ">=0.1.2"
pandas = "*"
pydantic-settings = "*"
scipy = ">1.8"
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
msal-extensions==1.1.0 ; python_version >= "3.10" and python_version < "4.0"
msal==1.26.0 ; python_version >= "3.10" and python_version < "4.0"
multidict==6.0.4 ; python_version >= "3.10" and python_version < "4.0"
multifutures==0.1.1 ; python_version >= "3.10" and python_version < "4.0"
multifutures==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0"
mypy==1.7.1 ; python_version >= "3.10" and python_version < "4.0"
nest-asyncio==1.5.8 ; python_version >= "3.10" and python_version < "4.0"
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
msal-extensions==1.1.0 ; python_version >= "3.10" and python_version < "4.0"
msal==1.26.0 ; python_version >= "3.10" and python_version < "4.0"
multidict==6.0.4 ; python_version >= "3.10" and python_version < "4.0"
multifutures==0.1.1 ; python_version >= "3.10" and python_version < "4.0"
multifutures==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
netcdf4==1.6.5 ; python_version >= "3.10" and python_version < "4.0"
numba==0.58.1 ; python_version >= "3.10" and python_version < "4.0"
numbagg==0.6.7 ; python_version >= "3.10" and python_version < "4.0"
Expand Down

0 comments on commit 2656def

Please sign in to comment.