Simplify import approach #32

Merged · 9 commits · Aug 13, 2024
2 changes: 1 addition & 1 deletion .github/workflows/deploy_daily_tests.yml
@@ -14,6 +14,6 @@ jobs:
   DailyTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
2 changes: 1 addition & 1 deletion .github/workflows/deploy_tests_on_pull_request.yml
@@ -12,6 +12,6 @@ jobs:
   DevTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
4 changes: 2 additions & 2 deletions .github/workflows/testing_dev.yml
@@ -2,15 +2,15 @@ name: Dev tests
 on:
   workflow_call:
     secrets:
-      IPINFO_HASH_SALT:
+      IP_HASH_SALT:
         required: true
       IPINFO_CREDENTIALS:
         required: true
       CODECOV_CREDENTIALS:
         required: true
 
 env:
-  IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+  IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
   IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
 
 jobs:
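The rename from `IPINFO_HASH_SALT` to `IP_HASH_SALT` has to be applied consistently across the workflow chain: the two caller workflows pass the secret, `testing_dev.yml` declares it and re-exports it as an environment variable, and the parser reads it at run time. A minimal sketch of the consuming side, assuming only that the variable is set in the environment (the decoding mirrors the `bytes.fromhex` call in `_ip_utils.py` below):

```python
import os

# The workflow `env` block above exposes the renamed secret to the test process;
# the parser decodes the hexadecimal string into raw bytes before hashing.
ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
print(f"Loaded a {len(ip_hash_salt)}-byte salt from IP_HASH_SALT.")
```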
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
 <a href="https://codecov.io/github/CatalystNeuro/dandi_s3_log_parser?branch=main"><img alt="codecov" src="https://codecov.io/github/CatalystNeuro/dandi_s3_log_parser/coverage.svg?branch=main"></a>
 </p>
 <p align="center">
-<a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="PyPI latest release version" src="https://badge.fury.io/py/dandi_s3_log_parser.svg"></a>
+<a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="PyPI latest release version" src="https://badge.fury.io/py/dandi_s3_log_parser.svg?id=py&kill_cache=1"></a>
 <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="Ubuntu" src="https://img.shields.io/badge/Ubuntu-E95420?style=flat&logo=ubuntu&logoColor=white"></a>
 <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/dandi_s3_log_parser.svg"></a>
 <a href="https://github.com/catalystneuro/dandi_s3_log_parser/blob/main/license.txt"><img alt="License: BSD-3" src="https://img.shields.io/pypi/l/dandi_s3_log_parser.svg"></a>
@@ -29,7 +29,7 @@ This parser reduces this amount of raw content down to only around 20 GB of cons
 
 These are then additionally mapped only to currently available assets in persistent published Dandiset versions and current drafts, which only comprise around 100 MB of total data.
 
-These small Dandiset-specific summaries are soon to be shared publically.
+These small Dandiset-specific summaries are soon to be shared publicly.
 
 
4 changes: 2 additions & 2 deletions src/dandi_s3_log_parser/__init__.py
@@ -1,14 +1,14 @@
 """Outermost exposed imports; including global environment variables."""
 
-from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, IPINFO_CREDENTIALS, get_hash_salt
+from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
 from ._s3_log_file_parser import parse_raw_s3_log
 from ._buffered_text_reader import BufferedTextReader
 from ._order_and_anonymize_parsed_logs import order_and_anonymize_parsed_logs
 from ._dandi_s3_log_file_parser import parse_dandi_raw_s3_log, parse_all_dandi_raw_s3_logs
+from ._ip_utils import get_hash_salt
 
 __all__ = [
     "DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH",
-    "IPINFO_CREDENTIALS",
     "BufferedTextReader",
     "get_hash_salt",
     "parse_raw_s3_log",
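The net effect of these import changes is that `import dandi_s3_log_parser` no longer requires any environment variables: `get_hash_salt` is now re-exported from `_ip_utils` instead of `_config`, and `IPINFO_CREDENTIALS` drops out of the public namespace. A minimal sketch of the new import surface:

```python
# Sketch: after this PR, the top-level import succeeds on a machine with
# neither IPINFO_CREDENTIALS nor IP_HASH_SALT set; those are only checked
# when a region lookup actually runs.
import dandi_s3_log_parser

print(dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH)
print(dandi_s3_log_parser.get_hash_salt)  # now sourced from _ip_utils
```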
2 changes: 1 addition & 1 deletion src/dandi_s3_log_parser/_command_line_interface.py
@@ -48,7 +48,7 @@
     "--maximum_number_of_workers",
     help="The maximum number of workers to distribute tasks across.",
     required=False,
-    type=click.IntRange(min=1, max=os.cpu_count()),
+    type=click.IntRange(min=1, max=os.cpu_count() * 5),
     default=1,
 )
 @click.option(
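Raising the upper bound from `os.cpu_count()` to `os.cpu_count() * 5` lets callers oversubscribe workers relative to physical cores, which can help when the tasks are I/O-bound rather than CPU-bound. A sketch of the validation behavior, using an illustrative command rather than the package's actual CLI (the `or 1` guard is an addition here, since `os.cpu_count()` can return `None`):

```python
import os

import click


@click.command()
@click.option(
    "--maximum_number_of_workers",
    required=False,
    # click rejects values outside [1, 5 * cores] before the command body runs.
    type=click.IntRange(min=1, max=(os.cpu_count() or 1) * 5),
    default=1,
)
def demo(maximum_number_of_workers: int) -> None:
    click.echo(f"Would distribute tasks across {maximum_number_of_workers} workers.")


if __name__ == "__main__":
    demo()
```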
37 changes: 0 additions & 37 deletions src/dandi_s3_log_parser/_config.py
@@ -1,5 +1,3 @@
-import hashlib
-import os
 import pathlib
 
 REQUEST_TYPES = ("GET", "PUT", "HEAD")
@@ -10,38 +8,3 @@
 DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH.mkdir(exist_ok=True)
 
 _IP_HASH_TO_REGION_FILE_PATH = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "ip_hash_to_region.yaml"
-
-if "IPINFO_CREDENTIALS" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!",
-    )  # pragma: no cover
-IPINFO_CREDENTIALS = os.environ["IPINFO_CREDENTIALS"]
-
-if "IPINFO_HASH_SALT" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
-        "To retrieve the value, set a temporary value to this environment variable and then use the `get_hash_salt` "
-        "helper function and set it to the correct value.",
-    )  # pragma: no cover
-IPINFO_HASH_SALT = bytes.fromhex(os.environ["IPINFO_HASH_SALT"])
-
-
-def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str:
-    """
-    Calculate the salt (in hexadecimal encoding) used for IP hashing.
-
-    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
-
-    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
-    """
-    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
-
-    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
-    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
-
-    with open(file=first_log_file_path) as io:
-        first_line = io.readline()
-
-    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
-
-    return hash_salt.hexdigest()
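With this block deleted, importing `_config` (and therefore the package) no longer raises when credentials are absent; the same checks reappear inside `_get_region_from_ip_address` in `_ip_utils.py` below, so they only fire on first use. A sketch of the deferred-check pattern, with a hypothetical helper name:

```python
import os


def _require_environment_variable(name: str) -> str:
    """Hypothetical helper: check an environment variable at call time, not import time."""
    if name not in os.environ:
        message = f"The environment variable {name!r} must be set before this operation!"
        raise ValueError(message)
    return os.environ[name]


# Called inside whatever function needs the value, so `import` stays side-effect free:
# ipinfo_credentials = _require_environment_variable("IPINFO_CREDENTIALS")
```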
2 changes: 1 addition & 1 deletion src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -32,7 +32,7 @@ def parse_all_dandi_raw_s3_logs(
     excluded_log_files: list[FilePath] | None = None,
     excluded_ips: collections.defaultdict[str, bool] | None = None,
     exclude_github_ips: bool = True,
-    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count(), default=1),
+    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count() * 5, default=1),
     maximum_buffer_size_in_bytes: int = 4 * 10**9,
 ) -> None:
     """
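One caveat worth noting: a `pydantic.Field` used as a plain default only enforces its `ge`/`le` bounds when the function is wrapped with `pydantic.validate_call` (or equivalent); whether this package applies such a wrapper is not visible in this diff. A sketch of the mechanism under that assumption:

```python
import os

from pydantic import Field, validate_call


@validate_call
def run(maximum_number_of_workers: int = Field(ge=1, le=(os.cpu_count() or 1) * 5, default=1)) -> None:
    print(f"Running with {maximum_number_of_workers} worker(s).")


run(maximum_number_of_workers=2)  # accepted
# run(maximum_number_of_workers=10_000)  # would raise a pydantic ValidationError
```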
47 changes: 41 additions & 6 deletions src/dandi_s3_log_parser/_ip_utils.py
@@ -2,9 +2,11 @@
 
 import datetime
 import hashlib
+import importlib.metadata
 import ipaddress
+import os
 import pathlib
+import traceback
-from importlib.metadata import version as importlib_version
 
 import ipinfo
 import requests
@@ -14,11 +16,30 @@
 from ._config import (
     _IP_HASH_TO_REGION_FILE_PATH,
     DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH,
-    IPINFO_CREDENTIALS,
-    IPINFO_HASH_SALT,
 )
 
 
+def get_hash_salt(base_raw_s3_log_folder_path: FilePath) -> str:
+    """
+    Calculate the salt (in hexadecimal encoding) used for IP hashing.
+
+    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
+
+    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
+    """
+    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
+
+    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
+    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
+
+    with open(file=first_log_file_path) as io:
+        first_line = io.readline()
+
+    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
+
+    return hash_salt.hexdigest()
+
+
 def _cidr_address_to_ip_range(*, cidr_address: str) -> list[str]:
     """Convert a CIDR address to a list of IP addresses."""
     cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0]))
@@ -85,7 +106,21 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     if ip_address == "unknown":
         return "unknown"
 
-    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + IPINFO_HASH_SALT).hexdigest()
+    if "IPINFO_CREDENTIALS" not in os.environ:
+        message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!"
+        raise ValueError(message)  # pragma: no cover
+    ipinfo_credentials = os.environ["IPINFO_CREDENTIALS"]
+
+    if "IP_HASH_SALT" not in os.environ:
+        message = (
+            "The environment variable 'IP_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
+            "To retrieve the value, set a temporary value to this environment variable "
+            "and then use the `get_hash_salt` helper function and set it to the correct value."
+        )
+        raise ValueError(message)  # pragma: no cover
+    ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
+
+    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest()
 
     # Early return for speed
     lookup_result = ip_hash_to_region.get(ip_hash)
@@ -95,7 +130,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     # Log errors in IP fetching
     # Lines cannot be covered without testing on a real IP
     try:  # pragma: no cover
-        handler = ipinfo.getHandler(access_token=IPINFO_CREDENTIALS)
+        handler = ipinfo.getHandler(access_token=ipinfo_credentials)
         details = handler.getDetails(ip_address=ip_address)
 
         country = details.details.get("country", None)
@@ -121,7 +156,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     errors_folder_path = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors"
     errors_folder_path.mkdir(exist_ok=True)
 
-    dandi_s3_log_parser_version = importlib_version(distribution_name="dandi_s3_log_parser")
+    dandi_s3_log_parser_version = importlib.metadata.version(distribution_name="dandi_s3_log_parser")
     date = datetime.datetime.now().strftime("%y%m%d")
     lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_ipinfo_errors.txt"
Expand Down