-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
dev(narugo): add retry session in entries
- Loading branch information
1 parent
3459e6c
commit 9d07268
Showing
9 changed files
with
143 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
""" | ||
This module provides functionality for creating and managing HTTP sessions with customizable retry logic, | ||
timeout settings, and user-agent rotation using random user-agent generation. It is designed to help with | ||
robust web scraping and API consumption by handling common HTTP errors and timeouts gracefully. | ||
Main Features: | ||
- Automatic retries on specified HTTP response status codes. | ||
- Configurable request timeout. | ||
- Rotating user-agent for each session to mimic different browsers and operating systems. | ||
- Optional SSL verification. | ||
""" | ||
|
||
from functools import lru_cache | ||
from typing import Optional, Dict | ||
|
||
import requests | ||
from random_user_agent.params import SoftwareName, OperatingSystem | ||
from random_user_agent.user_agent import UserAgent | ||
from requests.adapters import HTTPAdapter, Retry | ||
|
||
DEFAULT_TIMEOUT = 15 # seconds | ||
|
||
|
||
class TimeoutHTTPAdapter(HTTPAdapter): | ||
""" | ||
A custom HTTPAdapter that enforces a default timeout on all requests. | ||
:param args: Variable length argument list for HTTPAdapter. | ||
:param kwargs: Arbitrary keyword arguments. 'timeout' can be specified to set a custom timeout. | ||
""" | ||
|
||
def __init__(self, *args, **kwargs): | ||
self.timeout = DEFAULT_TIMEOUT | ||
if "timeout" in kwargs: | ||
self.timeout = kwargs["timeout"] | ||
del kwargs["timeout"] | ||
super().__init__(*args, **kwargs) | ||
|
||
def send(self, request, **kwargs): | ||
""" | ||
Sends the Request object, applying the timeout setting. | ||
:param request: The Request object to send. | ||
:type request: requests.PreparedRequest | ||
:param kwargs: Keyword arguments that may contain 'timeout'. | ||
:return: The response to the request. | ||
""" | ||
timeout = kwargs.get("timeout") | ||
if timeout is None: | ||
kwargs["timeout"] = self.timeout | ||
return super().send(request, **kwargs) | ||
|
||
|
||
def get_requests_session(max_retries: int = 5, timeout: int = DEFAULT_TIMEOUT, verify: bool = True, | ||
headers: Optional[Dict[str, str]] = None, session: Optional[requests.Session] = None) \ | ||
-> requests.Session: | ||
""" | ||
Creates a requests session with retry logic, timeout settings, and random user-agent headers. | ||
:param max_retries: Maximum number of retries on failed requests. | ||
:type max_retries: int | ||
:param timeout: Request timeout in seconds. | ||
:type timeout: int | ||
:param verify: Whether to verify SSL certificates. | ||
:type verify: bool | ||
:param headers: Additional headers to include in the requests. | ||
:type headers: Optional[Dict[str, str]] | ||
:param session: An existing requests.Session instance to use. | ||
:type session: Optional[requests.Session] | ||
:return: A configured requests.Session object. | ||
:rtype: requests.Session | ||
""" | ||
session = session or requests.session() | ||
retries = Retry( | ||
total=max_retries, backoff_factor=1, | ||
status_forcelist=[408, 413, 429, 500, 501, 502, 503, 504, 505, 506, 507, 509, 510, 511], | ||
allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"], | ||
) | ||
adapter = TimeoutHTTPAdapter(max_retries=retries, timeout=timeout, pool_connections=32, pool_maxsize=32) | ||
session.mount('http://', adapter) | ||
session.mount('https://', adapter) | ||
session.headers.update({ | ||
"User-Agent": get_random_ua(), | ||
**dict(headers or {}), | ||
}) | ||
if not verify: | ||
session.verify = False | ||
|
||
return session | ||
|
||
|
||
@lru_cache() | ||
def _ua_pool(): | ||
""" | ||
Creates and caches a UserAgent rotator instance with a specified number of user agents. | ||
:return: A UserAgent rotator instance. | ||
:rtype: UserAgent | ||
""" | ||
software_names = [SoftwareName.CHROME.value, SoftwareName.FIREFOX.value, SoftwareName.EDGE.value] | ||
operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.MACOS.value] | ||
|
||
user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=1000) | ||
return user_agent_rotator | ||
|
||
|
||
def get_random_ua(): | ||
""" | ||
Retrieves a random user agent string from the cached UserAgent rotator. | ||
:return: A random user agent string. | ||
:rtype: str | ||
""" | ||
return _ua_pool().get_random_user_agent() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ click>=7 | |
tzlocal | ||
natsort | ||
urlobject | ||
fsspec>=2024 | ||
fsspec>=2024 | ||
random_user_agent |