From d8c9a5ffc33910aca8004d0e4598db9cf13d6228 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Tue, 13 Aug 2024 00:44:55 -0400
Subject: [PATCH 1/8] override cpu limit

---
 src/dandi_s3_log_parser/_command_line_interface.py   | 2 +-
 src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index ede1c86..354657a 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -48,7 +48,7 @@
     "--maximum_number_of_workers",
     help="The maximum number of workers to distribute tasks across.",
     required=False,
-    type=click.IntRange(min=1, max=os.cpu_count()),
+    type=click.IntRange(min=1, max=os.cpu_count() * 5),
     default=1,
 )
 @click.option(
diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
index 07a48c4..54a12c0 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -32,7 +32,7 @@ def parse_all_dandi_raw_s3_logs(
     excluded_log_files: list[FilePath] | None = None,
     excluded_ips: collections.defaultdict[str, bool] | None = None,
     exclude_github_ips: bool = True,
-    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count(), default=1),
+    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count() * 5, default=1),
    maximum_buffer_size_in_bytes: int = 4 * 10**9,
 ) -> None:
     """
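A note on the new bound: allowing up to five workers per CPU core oversubscribes the machine on purpose, presumably because the workers spend much of their time blocked on file I/O rather than computing. A minimal standalone sketch of how `click.IntRange` enforces such a cap (the `demo` command and its message are illustrative, not part of the package):

    import os

    import click


    @click.command()
    @click.option(
        "--maximum_number_of_workers",
        required=False,
        # Guard against os.cpu_count() returning None on exotic platforms;
        # otherwise allow oversubscription up to 5 workers per core.
        type=click.IntRange(min=1, max=(os.cpu_count() or 1) * 5),
        default=1,
    )
    def demo(maximum_number_of_workers: int) -> None:
        click.echo(f"Would distribute tasks across {maximum_number_of_workers} workers.")


    if __name__ == "__main__":
        demo()

Values outside the range (for example `--maximum_number_of_workers 0`) make click abort with a usage error before any work starts, which is the behavior the patched option relies on.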
"IPINFO_HASH_SALT" not in os.environ: - raise ValueError( - "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! " - "To retrieve the value, set a temporary value to this environment variable and then use the `get_hash_salt` " - "helper function and set it to the correct value.", - ) # pragma: no cover -IPINFO_HASH_SALT = bytes.fromhex(os.environ["IPINFO_HASH_SALT"]) - - -def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str: - """ - Calculate the salt (in hexadecimal encoding) used for IP hashing. - - Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to. - - Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it. - """ - base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path) - - # Retrieve the first line of the first log file (which only we know) and use that as a secure salt - first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log" - - with open(file=first_log_file_path) as io: - first_line = io.readline() - - hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8")) - - return hash_salt.hexdigest() diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 122f154..cf15578 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -2,9 +2,11 @@ import datetime import hashlib +import importlib.metadata import ipaddress +import os +import pathlib import traceback -from importlib.metadata import version as importlib_version import ipinfo import requests @@ -14,11 +16,30 @@ from ._config import ( _IP_HASH_TO_REGION_FILE_PATH, DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, - IPINFO_CREDENTIALS, - IPINFO_HASH_SALT, ) +def get_hash_salt(base_raw_s3_log_folder_path: FilePath) -> str: + """ + Calculate the salt (in hexadecimal encoding) used for IP hashing. + + Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to. + + Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it. + """ + base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path) + + # Retrieve the first line of the first log file (which only we know) and use that as a secure salt + first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log" + + with open(file=first_log_file_path) as io: + first_line = io.readline() + + hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8")) + + return hash_salt.hexdigest() + + def _cidr_address_to_ip_range(*, cidr_address: str) -> list[str]: """Convert a CIDR address to a list of IP addresses.""" cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0])) @@ -85,7 +106,21 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st if ip_address == "unknown": return "unknown" - ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + IPINFO_HASH_SALT).hexdigest() + if "IPINFO_CREDENTIALS" not in os.environ: + message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!" + raise ValueError(message) # pragma: no cover + ipinfo_credentials = os.environ["IPINFO_CREDENTIALS"] + + if "IPINFO_HASH_SALT" not in os.environ: + message = ( + "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! 
" + "To retrieve the value, set a temporary value to this environment variable " + "and then use the `get_hash_salt` helper function and set it to the correct value." + ) + raise ValueError(message) # pragma: no cover + ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"]) + + ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest() # Early return for speed lookup_result = ip_hash_to_region.get(ip_hash) @@ -95,7 +130,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st # Log errors in IP fetching # Lines cannot be covered without testing on a real IP try: # pragma: no cover - handler = ipinfo.getHandler(access_token=IPINFO_CREDENTIALS) + handler = ipinfo.getHandler(access_token=ipinfo_credentials) details = handler.getDetails(ip_address=ip_address) country = details.details.get("country", None) @@ -121,7 +156,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st errors_folder_path = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" errors_folder_path.mkdir(exist_ok=True) - dandi_s3_log_parser_version = importlib_version(distribution_name="dandi_s3_log_parser") + dandi_s3_log_parser_version = importlib.metadata.version(distribution_name="dandi_s3_log_parser") date = datetime.datetime.now().strftime("%y%m%d") lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_ipinfo_errors.txt" From a967321d22b919b91304ba44016f64724bf6416d Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:34:57 -0400 Subject: [PATCH 3/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c97da7a..df856b7 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ This parser reduces this amount of raw content down to only around 20 GB of cons These are then additionally mapped only to currently available assets in persistent published Dandiset versions and current drafts, which only comprise around 100 MB of total data. -These small Dandiset-specific summaries are soon to be shared publically. +These small Dandiset-specific summaries are soon to be shared publicly. 
From 2f024fba5d5448d8e68c137641397acc827e4e11 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:46:34 -0400
Subject: [PATCH 4/8] update environment variable name

---
 .github/workflows/testing_dev.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_dev.yml
index 11f5e0f..27544ee 100644
--- a/.github/workflows/testing_dev.yml
+++ b/.github/workflows/testing_dev.yml
@@ -2,7 +2,7 @@ name: Dev tests
 on:
   workflow_call:
     secrets:
-      IPINFO_HASH_SALT:
+      IP_HASH_SALT:
         required: true
       IPINFO_CREDENTIALS:
         required: true
@@ -10,7 +10,7 @@ on:
         required: true

 env:
-  IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+  IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
   IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}

 jobs:

From a666c87c4274762be30930dc9787dada9a2d3951 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:46:51 -0400
Subject: [PATCH 5/8] update environment variable name

---
 .github/workflows/deploy_daily_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/deploy_daily_tests.yml b/.github/workflows/deploy_daily_tests.yml
index 4c5239a..2207945 100644
--- a/.github/workflows/deploy_daily_tests.yml
+++ b/.github/workflows/deploy_daily_tests.yml
@@ -14,6 +14,6 @@ jobs:
   DailyTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}

From 5aa557cd5bdb05c82f9633beee9e05b726d52b9f Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:47:03 -0400
Subject: [PATCH 6/8] update environment variable name

---
 .github/workflows/deploy_tests_on_pull_request.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/deploy_tests_on_pull_request.yml b/.github/workflows/deploy_tests_on_pull_request.yml
index 3b68591..dd6c825 100644
--- a/.github/workflows/deploy_tests_on_pull_request.yml
+++ b/.github/workflows/deploy_tests_on_pull_request.yml
@@ -12,6 +12,6 @@ jobs:
   DevTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}

From daeaec0e212d7c12f7f45c4de4a8636592533431 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:51:07 -0400
Subject: [PATCH 7/8] fix badge caching

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index df856b7..90566b2 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 codecov

-    PyPI latest release version
+    PyPI latest release version
     Ubuntu
     Supported Python versions
     License: BSD-3

From 731861ae0676ef51ad9d0ee3edba00c899ac8abe Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Tue, 13 Aug 2024 10:04:07 -0400
Subject: [PATCH 8/8] fix

---
 src/dandi_s3_log_parser/_ip_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py
index cf15578..3aeaeb8 100644
--- a/src/dandi_s3_log_parser/_ip_utils.py
+++ b/src/dandi_s3_log_parser/_ip_utils.py
@@ -111,9 +111,9 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
         raise ValueError(message)  # pragma: no cover
     ipinfo_credentials = os.environ["IPINFO_CREDENTIALS"]

-    if "IPINFO_HASH_SALT" not in os.environ:
+    if "IP_HASH_SALT" not in os.environ:
         message = (
-            "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
+            "The environment variable 'IP_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
             "To retrieve the value, set a temporary value to this environment variable "
             "and then use the `get_hash_salt` helper function and set it to the correct value."
         )
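This final patch closes the loop opened in patch 2, where the guard tested `IPINFO_HASH_SALT` while the line below it read `IP_HASH_SALT`: the check could pass and the read still raise `KeyError`. Routing both through a single constant, as in this illustrative sketch (not the repository's code), rules out that class of mismatch:

    import os

    # Illustrative pattern only: name the variable once so the existence check
    # and the actual read can never drift apart.
    _SALT_VARIABLE = "IP_HASH_SALT"

    if _SALT_VARIABLE not in os.environ:
        message = f"The environment variable '{_SALT_VARIABLE}' must be set!"
        raise ValueError(message)
    ip_hash_salt = bytes.fromhex(os.environ[_SALT_VARIABLE])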