Retry after timeout when waiting for PMHC extract

For large extracts, we have to wait a long time for the extract to be generated. While waiting, we make a request every 30 seconds to check whether the extract is ready. Any of these requests can time out during this period. This commit adds a max_retries parameter, so that a timed-out request is retried up to the specified number of times. By default, this is set to 20 retries.
swsphn · Dec 21, 2023 · 403c838 · 403c838
1 parent dd5e6a7
commit 403c838
Showing 1 changed file with 19 additions and 5 deletions.
diff --git a/src/pmhclib/pmhc.py b/src/pmhclib/pmhc.py
@@ -23,6 +23,7 @@
 from pathlib import Path
 from typing import Optional
 
+import playwright.sync_api
 from playwright.sync_api import sync_playwright
 from rich.progress import Progress, TimeElapsedColumn
 
@@ -328,6 +329,7 @@ def download_pmhc_mds(
  specification: PMHCSpecification = PMHCSpecification.PMHC,
  without_associated_dates: bool = False,
  matched_episodes: bool = False,
+ max_retries: int = 20,
  ) -> Path:
  """Extract PMHC MDS Data within the date range. If no date range
  is given, `start_date` defaults to 30 days before the current
@@ -346,6 +348,8 @@ def download_pmhc_mds(
  "Include data without associated dates"
  matched_episodes: Enable extract option
  "Include all data associated with matched episodes"
+ max_retries: Number of times to retry after timeout when
+ waiting for extract to be generated by PMHC website.
 
  Returns:
  Path to downloaded extract.
@@ -384,12 +388,22 @@ def download_pmhc_mds(
  # as we have the uuid. However, the URL doesn't exist
  # immediately. We loop until we get a success code.
  request_ok = False
- while not request_ok:
+ retries = 0
+ while not request_ok and retries <= max_retries:
  time.sleep(30)
- download_url_request = self.page.request.get(
- f"https://pmhc-mds.net/api/extract/{download_uuid}/fetch"
- )
- request_ok = download_url_request.ok
+ try:
+ download_url_request = self.page.request.get(
+ f"https://pmhc-mds.net/api/extract/{download_uuid}/fetch"
+ )
+ request_ok = download_url_request.ok
+ except playwright.sync_api.Error as err:
+ if "Request timed out" in err.message:
+ retries += 1
+ logging.warning(
+ f"Request timed out ({retries} of {max_retries}). Retrying."
+ )
+ else:
+ raise err
 
  download_url_json = download_url_request.json()
  download_url = download_url_json["location"]