Skip to content

Commit

Permalink
Retry after timeout when waiting for PMHC extract
Browse files Browse the repository at this point in the history
For large extracts, we have to wait a long time for the extract to
generate. While waiting, we make a request every 30 seconds to check if
the extract is ready. There are many opportunities for a request to time
out during this period. This commit adds a max_retries parameter, so
that a timed-out request is retried, up to the specified number of
times. By default, this is set to 20 retries.
  • Loading branch information
daviewales committed Dec 21, 2023
1 parent dd5e6a7 commit 403c838
Showing 1 changed file with 19 additions and 5 deletions.
24 changes: 19 additions & 5 deletions src/pmhclib/pmhc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pathlib import Path
from typing import Optional

import playwright.sync_api
from playwright.sync_api import sync_playwright
from rich.progress import Progress, TimeElapsedColumn

Expand Down Expand Up @@ -328,6 +329,7 @@ def download_pmhc_mds(
specification: PMHCSpecification = PMHCSpecification.PMHC,
without_associated_dates: bool = False,
matched_episodes: bool = False,
max_retries: int = 20,
) -> Path:
"""Extract PMHC MDS Data within the date range. If no date range
is given, `start_date` defaults to 30 days before the current
Expand All @@ -346,6 +348,8 @@ def download_pmhc_mds(
"Include data without associated dates"
matched_episodes: Enable extract option
"Include all data associated with matched episodes"
max_retries: Number of times to retry after timeout when
waiting for extract to be generated by PMHC website.
Returns:
Path to downloaded extract.
Expand Down Expand Up @@ -384,12 +388,22 @@ def download_pmhc_mds(
# as we have the uuid. However, the URL doesn't exist
# immediately. We loop until we get a success code.
request_ok = False
while not request_ok:
retries = 0
while not request_ok and retries <= max_retries:
time.sleep(30)
download_url_request = self.page.request.get(
f"https://pmhc-mds.net/api/extract/{download_uuid}/fetch"
)
request_ok = download_url_request.ok
try:
download_url_request = self.page.request.get(
f"https://pmhc-mds.net/api/extract/{download_uuid}/fetch"
)
request_ok = download_url_request.ok
except playwright.sync_api.Error as err:
if "Request timed out" in err.message:
retries += 1
logging.warning(
f"Request timed out ({retries} of {max_retries}). Retrying."
)
else:
raise err

download_url_json = download_url_request.json()
download_url = download_url_json["location"]
Expand Down

0 comments on commit 403c838

Please sign in to comment.