diff --git a/.github/workflows/deploy_daily_tests.yml b/.github/workflows/deploy_daily_tests.yml
index 4c5239a..2207945 100644
--- a/.github/workflows/deploy_daily_tests.yml
+++ b/.github/workflows/deploy_daily_tests.yml
@@ -14,6 +14,6 @@ jobs:
   DailyTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
diff --git a/.github/workflows/deploy_tests_on_pull_request.yml b/.github/workflows/deploy_tests_on_pull_request.yml
index 3b68591..dd6c825 100644
--- a/.github/workflows/deploy_tests_on_pull_request.yml
+++ b/.github/workflows/deploy_tests_on_pull_request.yml
@@ -12,6 +12,6 @@ jobs:
   DevTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_dev.yml
index 11f5e0f..27544ee 100644
--- a/.github/workflows/testing_dev.yml
+++ b/.github/workflows/testing_dev.yml
@@ -2,7 +2,7 @@ name: Dev tests
 on:
   workflow_call:
     secrets:
-      IPINFO_HASH_SALT:
+      IP_HASH_SALT:
         required: true
       IPINFO_CREDENTIALS:
         required: true
@@ -10,7 +10,7 @@ on:
         required: true
 
 env:
-  IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+  IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
   IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
 
 jobs:
diff --git a/README.md b/README.md
index c97da7a..90566b2 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 codecov
 
-PyPI latest release version
+PyPI latest release version
 Ubuntu
 Supported Python versions
 License: BSD-3
@@ -29,7 +29,7 @@ This parser reduces this amount of raw content down to only around 20 GB of cons
 These are then additionally mapped only to currently available assets in persistent published Dandiset versions and current drafts, which only comprise around 100 MB of total data.
 
-These small Dandiset-specific summaries are soon to be shared publically.
+These small Dandiset-specific summaries are soon to be shared publicly.
 
diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py
index 35aad56..1451f3b 100644
--- a/src/dandi_s3_log_parser/__init__.py
+++ b/src/dandi_s3_log_parser/__init__.py
@@ -1,14 +1,14 @@
 """Outermost exposed imports; including global environment variables."""
 
-from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, IPINFO_CREDENTIALS, get_hash_salt
+from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
 from ._s3_log_file_parser import parse_raw_s3_log
 from ._buffered_text_reader import BufferedTextReader
 from ._order_and_anonymize_parsed_logs import order_and_anonymize_parsed_logs
 from ._dandi_s3_log_file_parser import parse_dandi_raw_s3_log, parse_all_dandi_raw_s3_logs
+from ._ip_utils import get_hash_salt
 
 __all__ = [
     "DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH",
-    "IPINFO_CREDENTIALS",
     "BufferedTextReader",
     "get_hash_salt",
     "parse_raw_s3_log",
diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index ede1c86..354657a 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -48,7 +48,7 @@
     "--maximum_number_of_workers",
     help="The maximum number of workers to distribute tasks across.",
     required=False,
-    type=click.IntRange(min=1, max=os.cpu_count()),
+    type=click.IntRange(min=1, max=os.cpu_count() * 5),
     default=1,
 )
 @click.option(
diff --git a/src/dandi_s3_log_parser/_config.py b/src/dandi_s3_log_parser/_config.py
index 090ca76..8c950ef 100644
--- a/src/dandi_s3_log_parser/_config.py
+++ b/src/dandi_s3_log_parser/_config.py
@@ -1,5 +1,3 @@
-import hashlib
-import os
 import pathlib
 
 REQUEST_TYPES = ("GET", "PUT", "HEAD")
@@ -10,38 +8,3 @@
 DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH.mkdir(exist_ok=True)
 
 _IP_HASH_TO_REGION_FILE_PATH = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "ip_hash_to_region.yaml"
-
-if "IPINFO_CREDENTIALS" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!",
-    )  # pragma: no cover
-IPINFO_CREDENTIALS = os.environ["IPINFO_CREDENTIALS"]
-
-if "IPINFO_HASH_SALT" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
-        "To retrieve the value, set a temporary value to this environment variable and then use the `get_hash_salt` "
-        "helper function and set it to the correct value.",
-    )  # pragma: no cover
-IPINFO_HASH_SALT = bytes.fromhex(os.environ["IPINFO_HASH_SALT"])
-
-
-def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str:
-    """
-    Calculate the salt (in hexadecimal encoding) used for IP hashing.
-
-    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
-
-    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
-    """
-    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
-
-    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
-    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
-
-    with open(file=first_log_file_path) as io:
-        first_line = io.readline()
-
-    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
-
-    return hash_salt.hexdigest()
diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
index 07a48c4..54a12c0 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -32,7 +32,7 @@ def parse_all_dandi_raw_s3_logs(
     excluded_log_files: list[FilePath] | None = None,
     excluded_ips: collections.defaultdict[str, bool] | None = None,
     exclude_github_ips: bool = True,
-    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count(), default=1),
+    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count() * 5, default=1),
     maximum_buffer_size_in_bytes: int = 4 * 10**9,
 ) -> None:
     """
diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py
index 122f154..3aeaeb8 100644
--- a/src/dandi_s3_log_parser/_ip_utils.py
+++ b/src/dandi_s3_log_parser/_ip_utils.py
@@ -2,9 +2,11 @@
 
 import datetime
 import hashlib
+import importlib.metadata
 import ipaddress
+import os
+import pathlib
 import traceback
-from importlib.metadata import version as importlib_version
 
 import ipinfo
 import requests
@@ -14,11 +16,30 @@
 from ._config import (
     _IP_HASH_TO_REGION_FILE_PATH,
     DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH,
-    IPINFO_CREDENTIALS,
-    IPINFO_HASH_SALT,
 )
 
 
+def get_hash_salt(base_raw_s3_log_folder_path: FilePath) -> str:
+    """
+    Calculate the salt (in hexadecimal encoding) used for IP hashing.
+
+    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
+
+    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
+    """
+    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
+
+    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
+    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
+
+    with open(file=first_log_file_path) as io:
+        first_line = io.readline()
+
+    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
+
+    return hash_salt.hexdigest()
+
+
 def _cidr_address_to_ip_range(*, cidr_address: str) -> list[str]:
     """Convert a CIDR address to a list of IP addresses."""
     cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0]))
@@ -85,7 +106,21 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     if ip_address == "unknown":
         return "unknown"
 
-    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + IPINFO_HASH_SALT).hexdigest()
+    if "IPINFO_CREDENTIALS" not in os.environ:
+        message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!"
+        raise ValueError(message)  # pragma: no cover
+    ipinfo_credentials = os.environ["IPINFO_CREDENTIALS"]
+
+    if "IP_HASH_SALT" not in os.environ:
+        message = (
+            "The environment variable 'IP_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
+            "To retrieve the value, set a temporary value to this environment variable "
+            "and then use the `get_hash_salt` helper function and set it to the correct value."
+        )
+        raise ValueError(message)  # pragma: no cover
+    ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
+
+    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest()
 
     # Early return for speed
     lookup_result = ip_hash_to_region.get(ip_hash)
@@ -95,7 +130,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     # Log errors in IP fetching
     # Lines cannot be covered without testing on a real IP
     try:  # pragma: no cover
-        handler = ipinfo.getHandler(access_token=IPINFO_CREDENTIALS)
+        handler = ipinfo.getHandler(access_token=ipinfo_credentials)
         details = handler.getDetails(ip_address=ip_address)
 
         country = details.details.get("country", None)
@@ -121,7 +156,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
         errors_folder_path = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors"
         errors_folder_path.mkdir(exist_ok=True)
 
-        dandi_s3_log_parser_version = importlib_version(distribution_name="dandi_s3_log_parser")
+        dandi_s3_log_parser_version = importlib.metadata.version(distribution_name="dandi_s3_log_parser")
         date = datetime.datetime.now().strftime("%y%m%d")
         lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_ipinfo_errors.txt"
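Illustrative usage sketch (not part of the diff): after this change the IP_HASH_SALT and IPINFO_CREDENTIALS environment variables are read only when a region lookup actually runs inside _get_region_from_ip_address, so a caller is expected to export them before parsing rather than before import. The snippet below mirrors the salted SHA-1 anonymization shown above; the raw log folder path and the IPinfo token are hypothetical placeholders.

import hashlib
import os

from dandi_s3_log_parser import get_hash_salt

# Hypothetical folder of raw S3 logs; get_hash_salt reads <folder>/2019/10/01.log internally.
base_raw_s3_log_folder_path = "/path/to/raw_s3_logs"

# Derive the hex-encoded salt from private log data and export it for later parsing runs.
os.environ["IP_HASH_SALT"] = get_hash_salt(base_raw_s3_log_folder_path)
os.environ["IPINFO_CREDENTIALS"] = "<ipinfo-access-token>"  # hypothetical token value

# Same anonymization scheme as _get_region_from_ip_address: salted SHA-1 of the IP string.
ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
ip_hash = hashlib.sha1(string=bytes("192.0.2.1", "utf-8") + ip_hash_salt).hexdigest()

Because the checks now live inside _get_region_from_ip_address, importing dandi_s3_log_parser (for example, just to call get_hash_salt) no longer requires either variable to be set.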