From 7c63a2c1766d353025236cd175dd3123ef77976d Mon Sep 17 00:00:00 2001 From: Benjamin Pelletier Date: Mon, 8 May 2023 11:17:28 -0700 Subject: [PATCH] [monitorlib] Add timeout retry logic for query_and_describe (#94) * Add timeout retry logic for query_and_describe * Address comments Co-authored-by: Michael Barroco --------- Co-authored-by: Michael Barroco --- monitoring/monitorlib/fetch/__init__.py | 40 ++++++++++++++++++++----- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/monitoring/monitorlib/fetch/__init__.py b/monitoring/monitorlib/fetch/__init__.py index bd945b4b09..8f57d67326 100644 --- a/monitoring/monitorlib/fetch/__init__.py +++ b/monitoring/monitorlib/fetch/__init__.py @@ -1,10 +1,10 @@ import datetime import json import traceback -from types import MappingProxyType from typing import Dict, Optional, List import flask +from loguru import logger import requests import urllib3 import yaml @@ -15,6 +15,9 @@ TIMEOUTS = (5, 25) # Timeouts of `connect` and `read` in seconds +ATTEMPTS = ( + 2 # Number of attempts to query when experiencing a retryable error like a timeout +) class RequestDescription(ImplicitDict): @@ -197,12 +200,35 @@ def query_and_describe( req_kwargs = kwargs.copy() if "timeout" not in req_kwargs: req_kwargs["timeout"] = TIMEOUTS - t0 = datetime.datetime.utcnow() - try: - return describe_query(client.request(verb, url, **req_kwargs), t0) - except (requests.RequestException, urllib3.exceptions.ReadTimeoutError) as e: - msg = "{}: {}".format(type(e).__name__, str(e)) - t1 = datetime.datetime.utcnow() + + # Note: retry logic could be attached to the `client` Session by `mount`ing an HTTPAdapter with custom + # `max_retries`, however we do not want to mutate the provided Session. Instead, retry only on errors we explicitly + # consider retryable. + for attempt in range(ATTEMPTS): + t0 = datetime.datetime.utcnow() + try: + return describe_query(client.request(verb, url, **req_kwargs), t0) + except (requests.Timeout, urllib3.exceptions.ReadTimeoutError) as e: + logger.warning( + "query_and_describe attempt {} to {} {} failed with timeout {}: {}", + attempt + 1, + verb, + url, + type(e).__name__, + str(e), + ) + except requests.RequestException as e: + logger.warning( + "query_and_describe attempt {} to {} {} failed with non-retryable RequestException {}: {}", + attempt + 1, + verb, + url, + type(e).__name__, + str(e), + ) + break + finally: + t1 = datetime.datetime.utcnow() # Reconstruct request similar to the one in the query (which is not # accessible at this point)