-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #37 from deepghs/dev/retry
dev(narugo): add retry session in entries
- Loading branch information
Showing
12 changed files
with
257 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ hfutils.utils | |
download | ||
number | ||
path | ||
session | ||
tqdm_ | ||
walk | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
hfutils.utils.session | ||
================================= | ||
|
||
.. currentmodule:: hfutils.utils.session | ||
|
||
.. automodule:: hfutils.utils.session | ||
|
||
|
||
|
||
TimeoutHTTPAdapter | ||
----------------------------------------------------- | ||
|
||
.. autoclass:: TimeoutHTTPAdapter | ||
:members: __init__, send | ||
|
||
|
||
|
||
get_requests_session | ||
----------------------------------------------------- | ||
|
||
.. autofunction:: get_requests_session | ||
|
||
|
||
|
||
get_random_ua | ||
----------------------------------------------------- | ||
|
||
.. autofunction:: get_random_ua | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
""" | ||
This module provides functionality for creating and managing HTTP sessions with customizable retry logic, | ||
timeout settings, and user-agent rotation using random user-agent generation. It is designed to help with | ||
robust web scraping and API consumption by handling common HTTP errors and timeouts gracefully. | ||
Main Features: | ||
- Automatic retries on specified HTTP response status codes. | ||
- Configurable request timeout. | ||
- Rotating user-agent for each session to mimic different browsers and operating systems. | ||
- Optional SSL verification. | ||
""" | ||
|
||
from functools import lru_cache | ||
from typing import Optional, Dict | ||
|
||
import requests | ||
from random_user_agent.params import SoftwareName, OperatingSystem | ||
from random_user_agent.user_agent import UserAgent | ||
from requests.adapters import HTTPAdapter, Retry | ||
|
||
DEFAULT_TIMEOUT = 15 # seconds | ||
|
||
|
||
class TimeoutHTTPAdapter(HTTPAdapter):
    """
    HTTP adapter that supplies a fallback timeout for requests that do not specify one.

    :param args: Positional arguments forwarded unchanged to :class:`HTTPAdapter`.
    :param kwargs: Keyword arguments forwarded to :class:`HTTPAdapter`. An optional ``timeout``
        entry is consumed here and stored as the adapter's fallback timeout; when it is
        absent, ``DEFAULT_TIMEOUT`` is used instead.
    """

    def __init__(self, *args, **kwargs):
        # 'timeout' is taken out of kwargs so it is never passed on to HTTPAdapter.__init__.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """
        Send the request, filling in the adapter's timeout when none was given.

        :param request: The Request object to send.
        :type request: requests.PreparedRequest
        :param kwargs: Keyword arguments for the parent ``send``. A missing or ``None``
            ``timeout`` is replaced by ``self.timeout``; any other value is kept as is.
        :return: The response to the request.
        """
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
|
||
|
||
def get_requests_session(max_retries: int = 5, timeout: int = DEFAULT_TIMEOUT, verify: bool = True,
                         headers: Optional[Dict[str, str]] = None,
                         session: Optional[requests.Session] = None) -> requests.Session:
    """
    Build (or reconfigure) a requests session with retries, a default timeout and a random user agent.

    A single :class:`TimeoutHTTPAdapter` is mounted for both ``http://`` and ``https://``. The
    session headers receive a freshly picked ``User-Agent`` followed by the entries of ``headers``,
    so a caller-supplied ``User-Agent`` key replaces the generated one.

    :param max_retries: Maximum number of retries on failed requests.
    :type max_retries: int
    :param timeout: Request timeout in seconds, used when a request does not set its own.
    :type timeout: int
    :param verify: Whether to verify SSL certificates. ``False`` disables verification on the
        session; ``True`` leaves the session's current ``verify`` setting untouched.
    :type verify: bool
    :param headers: Additional headers to include in the requests.
    :type headers: Optional[Dict[str, str]]
    :param session: An existing requests.Session instance to configure instead of creating one.
    :type session: Optional[requests.Session]
    :return: The configured session (the same object as ``session`` when one was passed in).
    :rtype: requests.Session
    """
    if not session:
        session = requests.session()

    retry_policy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[408, 429, 500, 501, 502, 503, 504, 505, 506, 507, 509, 510, 511],
        allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"],
    )
    timeout_adapter = TimeoutHTTPAdapter(
        max_retries=retry_policy,
        timeout=timeout,
        pool_connections=32,
        pool_maxsize=32,
    )
    # Both schemes share the same adapter instance.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, timeout_adapter)

    # The random user agent goes first so that caller-provided headers can override it.
    merged_headers = {"User-Agent": get_random_ua()}
    merged_headers.update(headers or {})
    session.headers.update(merged_headers)

    if not verify:
        session.verify = False

    return session
|
||
|
||
@lru_cache()
def _ua_pool():
    """
    Build the shared UserAgent rotator.

    The rotator is restricted to Chrome, Firefox and Edge on Windows and macOS and is created
    with ``limit=1000``. Because of ``lru_cache``, it is constructed on the first call only and
    the same instance is returned afterwards.

    :return: A UserAgent rotator instance.
    :rtype: UserAgent
    """
    return UserAgent(
        software_names=[
            SoftwareName.CHROME.value,
            SoftwareName.FIREFOX.value,
            SoftwareName.EDGE.value,
        ],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.MACOS.value,
        ],
        limit=1000,
    )
|
||
|
||
def get_random_ua():
    """
    Pick a random user agent string from the cached rotator returned by ``_ua_pool``.

    :return: A random user agent string.
    :rtype: str
    """
    rotator = _ua_pool()
    return rotator.get_random_user_agent()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ click>=7 | |
tzlocal | ||
natsort | ||
urlobject | ||
fsspec>=2024 | ||
fsspec>=2024 | ||
random_user_agent |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from unittest.mock import patch, Mock | ||
|
||
import pytest | ||
import requests | ||
from huggingface_hub import hf_hub_url | ||
from requests.adapters import HTTPAdapter | ||
|
||
from hfutils.utils.session import TimeoutHTTPAdapter, get_requests_session, get_random_ua | ||
|
||
|
||
@pytest.fixture
def mock_requests_session():
    """Patch ``requests.session`` for the duration of a test and yield the mock's return value."""
    session_patcher = patch('requests.session')
    with session_patcher as patched_factory:
        yield patched_factory.return_value
|
||
|
||
@pytest.fixture
def mock_ua_pool():
    """Patch ``_ua_pool`` so its rotator always returns ``'MockUserAgent'``; yields the patched mock."""
    with patch('hfutils.utils.session._ua_pool') as patched_pool:
        fake_rotator = patched_pool.return_value
        fake_rotator.get_random_user_agent.return_value = 'MockUserAgent'
        yield patched_pool
|
||
|
||
@pytest.fixture()
def example_url():
    """Return the hub URL of ``README.md`` in the ``deepghs/danbooru_newest`` dataset repository."""
    target = {
        'repo_id': 'deepghs/danbooru_newest',
        'repo_type': 'dataset',
        'filename': 'README.md',
    }
    return hf_hub_url(**target)
|
||
|
||
@pytest.mark.unittest
class TestUtilsSession:
    """Unit tests for the adapter, session factory and user-agent helper in ``hfutils.utils.session``."""

    def test_timeout_http_adapter_init(self):
        """The adapter stores the default timeout, or the one passed via ``timeout=``."""
        adapter = TimeoutHTTPAdapter()
        assert adapter.timeout == 15

        adapter = TimeoutHTTPAdapter(timeout=30)
        assert adapter.timeout == 30

    def test_timeout_http_adapter_send(self):
        """``send`` injects the adapter timeout only when the caller did not provide one."""
        adapter = TimeoutHTTPAdapter(timeout=10)
        mock_request = Mock()
        mock_kwargs = {}

        with patch.object(HTTPAdapter, 'send') as mock_send:
            adapter.send(mock_request, **mock_kwargs)
            mock_send.assert_called_once_with(mock_request, timeout=10)

        mock_kwargs = {'timeout': 20}
        with patch.object(HTTPAdapter, 'send') as mock_send:
            adapter.send(mock_request, **mock_kwargs)
            mock_send.assert_called_once_with(mock_request, timeout=20)

    def test_get_requests_session(self, mock_ua_pool):
        """Default construction, custom headers, ``verify=False`` and session reuse."""
        session = get_requests_session()
        assert isinstance(session, requests.Session)
        assert 'User-Agent' in session.headers
        assert session.headers['User-Agent'] == 'MockUserAgent'

        custom_headers = {'Custom-Header': 'Value'}
        session = get_requests_session(headers=custom_headers)
        assert 'Custom-Header' in session.headers
        assert session.headers['Custom-Header'] == 'Value'

        session = get_requests_session(verify=False)
        assert session.verify is False

        existing_session = requests.Session()
        session = get_requests_session(session=existing_session)
        assert session is existing_session

    def test_get_requests_session_with_custom_params(self):
        """``max_retries`` and ``timeout`` must reach the adapter mounted for both schemes."""
        session = get_requests_session(max_retries=3, timeout=30)
        assert isinstance(session, requests.Session)

        # Look up the adapter the session would actually use for each scheme and
        # verify that the custom parameters were applied to it.
        for scheme in ('http://', 'https://'):
            adapter = session.get_adapter(scheme + 'huggingface.co')
            assert isinstance(adapter, TimeoutHTTPAdapter)
            assert adapter.timeout == 30
            assert adapter.max_retries.total == 3

    def test_get_random_ua(self, mock_ua_pool):
        """``get_random_ua`` returns whatever the (mocked) rotator produces, calling it once."""
        ua = get_random_ua()
        assert ua == 'MockUserAgent'
        mock_ua_pool.return_value.get_random_user_agent.assert_called_once()