Simplify import approach (#32)
* override cpu limit

* simplify import

* Update README.md

* update environment variable name

* update environment variable name

* update environment variable name

* fix badge caching

* fix

---------

Co-authored-by: CodyCBakerPhD <[email protected]>
CodyCBakerPhD authored Aug 13, 2024
1 parent be048a6 commit d941bc1
Showing 9 changed files with 51 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy_daily_tests.yml
@@ -14,6 +14,6 @@ jobs:
   DailyTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
2 changes: 1 addition & 1 deletion .github/workflows/deploy_tests_on_pull_request.yml
@@ -12,6 +12,6 @@ jobs:
   DevTests:
     uses: ./.github/workflows/testing_dev.yml
     secrets:
-      IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+      IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
       IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
       CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }}
4 changes: 2 additions & 2 deletions .github/workflows/testing_dev.yml
@@ -2,15 +2,15 @@ name: Dev tests
 on:
   workflow_call:
     secrets:
-      IPINFO_HASH_SALT:
+      IP_HASH_SALT:
         required: true
       IPINFO_CREDENTIALS:
         required: true
       CODECOV_CREDENTIALS:
         required: true
 
 env:
-  IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}
+  IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }}
   IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }}
 
 jobs:
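For context, a minimal sketch (an assumption, not part of this diff) of how the renamed secret reaches the parser: the workflow above exports IP_HASH_SALT into the job environment, and the Python side decodes the hex-encoded value only when a lookup needs it.

import os

# IP_HASH_SALT is the hex-encoded salt exported by the workflows above.
ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])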
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
     <a href="https://codecov.io/github/CatalystNeuro/dandi_s3_log_parser?branch=main"><img alt="codecov" src="https://codecov.io/github/CatalystNeuro/dandi_s3_log_parser/coverage.svg?branch=main"></a>
 </p>
 <p align="center">
-    <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="PyPI latest release version" src="https://badge.fury.io/py/dandi_s3_log_parser.svg"></a>
+    <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="PyPI latest release version" src="https://badge.fury.io/py/dandi_s3_log_parser.svg?id=py&kill_cache=1"></a>
     <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="Ubuntu" src="https://img.shields.io/badge/Ubuntu-E95420?style=flat&logo=ubuntu&logoColor=white"></a>
     <a href="https://pypi.org/project/dandi_s3_log_parser/"><img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/dandi_s3_log_parser.svg"></a>
     <a href="https://github.com/catalystneuro/dandi_s3_log_parser/blob/main/license.txt"><img alt="License: BSD-3" src="https://img.shields.io/pypi/l/dandi_s3_log_parser.svg"></a>
@@ -29,7 +29,7 @@ This parser reduces this amount of raw content down to only around 20 GB of cons

 These are then additionally mapped only to currently available assets in persistent published Dandiset versions and current drafts, which only comprise around 100 MB of total data.
 
-These small Dandiset-specific summaries are soon to be shared publically.
+These small Dandiset-specific summaries are soon to be shared publicly.



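(A note on the badge change above: the appended query string is presumably a cache-buster. GitHub serves README images through its Camo proxy, and an otherwise-unused parameter such as kill_cache=1 forces a fresh fetch of the PyPI version badge instead of a stale cached copy.)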
4 changes: 2 additions & 2 deletions src/dandi_s3_log_parser/__init__.py
@@ -1,14 +1,14 @@
"""Outermost exposed imports; including global environment variables."""

from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, IPINFO_CREDENTIALS, get_hash_salt
from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
from ._s3_log_file_parser import parse_raw_s3_log
from ._buffered_text_reader import BufferedTextReader
from ._order_and_anonymize_parsed_logs import order_and_anonymize_parsed_logs
from ._dandi_s3_log_file_parser import parse_dandi_raw_s3_log, parse_all_dandi_raw_s3_logs
from ._ip_utils import get_hash_salt

__all__ = [
"DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH",
"IPINFO_CREDENTIALS",
"BufferedTextReader",
"get_hash_salt",
"parse_raw_s3_log",
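A minimal sketch of what the simplified import buys, assuming the behavior implied by this diff (the module-level credential checks move out of _config.py, so the package imports cleanly without any secrets in the environment):

import os

# Neither variable needs to be set merely to import the package anymore.
os.environ.pop("IPINFO_CREDENTIALS", None)
os.environ.pop("IP_HASH_SALT", None)

import dandi_s3_log_parser  # previously raised ValueError at import time

# Only code paths that actually hash or geolocate IPs will demand the secrets.
print(dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH)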
2 changes: 1 addition & 1 deletion src/dandi_s3_log_parser/_command_line_interface.py
@@ -48,7 +48,7 @@
"--maximum_number_of_workers",
help="The maximum number of workers to distribute tasks across.",
required=False,
type=click.IntRange(min=1, max=os.cpu_count()),
type=click.IntRange(min=1, max=os.cpu_count() * 5),
default=1,
)
@click.option(
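A runnable sketch of the relaxed CLI bound; the demo command is hypothetical, and only the click.IntRange change itself comes from the diff. Allowing up to five times the CPU count is reasonable for I/O-bound log parsing, where workers spend most of their time waiting on reads:

import os

import click


@click.command()
@click.option(
    "--maximum_number_of_workers",
    help="The maximum number of workers to distribute tasks across.",
    required=False,
    type=click.IntRange(min=1, max=os.cpu_count() * 5),  # ceiling was os.cpu_count()
    default=1,
)
def demo(maximum_number_of_workers: int) -> None:
    click.echo(f"Distributing tasks across {maximum_number_of_workers} workers.")


if __name__ == "__main__":
    demo()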
37 changes: 0 additions & 37 deletions src/dandi_s3_log_parser/_config.py
@@ -1,5 +1,3 @@
-import hashlib
-import os
 import pathlib
 
 REQUEST_TYPES = ("GET", "PUT", "HEAD")
@@ -10,38 +8,3 @@
 DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH.mkdir(exist_ok=True)
 
 _IP_HASH_TO_REGION_FILE_PATH = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "ip_hash_to_region.yaml"
-
-if "IPINFO_CREDENTIALS" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!",
-    )  # pragma: no cover
-IPINFO_CREDENTIALS = os.environ["IPINFO_CREDENTIALS"]
-
-if "IPINFO_HASH_SALT" not in os.environ:
-    raise ValueError(
-        "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
-        "To retrieve the value, set a temporary value to this environment variable and then use the `get_hash_salt` "
-        "helper function and set it to the correct value.",
-    )  # pragma: no cover
-IPINFO_HASH_SALT = bytes.fromhex(os.environ["IPINFO_HASH_SALT"])
-
-
-def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str:
-    """
-    Calculate the salt (in hexadecimal encoding) used for IP hashing.
-
-    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
-    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
-    """
-    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
-
-    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
-    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
-
-    with open(file=first_log_file_path) as io:
-        first_line = io.readline()
-
-    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
-
-    return hash_salt.hexdigest()
2 changes: 1 addition & 1 deletion src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -32,7 +32,7 @@ def parse_all_dandi_raw_s3_logs(
     excluded_log_files: list[FilePath] | None = None,
     excluded_ips: collections.defaultdict[str, bool] | None = None,
     exclude_github_ips: bool = True,
-    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count(), default=1),
+    maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count() * 5, default=1),
     maximum_buffer_size_in_bytes: int = 4 * 10**9,
 ) -> None:
     """
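The same ceiling is mirrored on the Python API through the pydantic Field bound above. A minimal sketch of how that constraint behaves, assuming the function is wrapped with pydantic's validate_call (which is what a Field default on a plain function implies); the run function here is hypothetical:

import os

from pydantic import Field, ValidationError, validate_call


@validate_call
def run(maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count() * 5, default=1)) -> int:
    # Return the validated worker count.
    return maximum_number_of_workers


print(run(maximum_number_of_workers=os.cpu_count() * 5))  # at the new ceiling: accepted

try:
    run(maximum_number_of_workers=os.cpu_count() * 5 + 1)  # one past the ceiling
except ValidationError as error:
    print(error)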
47 changes: 41 additions & 6 deletions src/dandi_s3_log_parser/_ip_utils.py
@@ -2,9 +2,11 @@

 import datetime
 import hashlib
+import importlib.metadata
 import ipaddress
+import os
 import pathlib
+import traceback
-from importlib.metadata import version as importlib_version
 
 import ipinfo
 import requests
@@ -14,11 +16,30 @@
 from ._config import (
     _IP_HASH_TO_REGION_FILE_PATH,
     DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH,
-    IPINFO_CREDENTIALS,
-    IPINFO_HASH_SALT,
 )
 
 
+def get_hash_salt(base_raw_s3_log_folder_path: FilePath) -> str:
+    """
+    Calculate the salt (in hexadecimal encoding) used for IP hashing.
+
+    Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to.
+    Otherwise, it would be fairly easy to iterate over every possible IP address and find the SHA1 of it.
+    """
+    base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path)
+
+    # Retrieve the first line of the first log file (which only we know) and use that as a secure salt
+    first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log"
+
+    with open(file=first_log_file_path) as io:
+        first_line = io.readline()
+
+    hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8"))
+
+    return hash_salt.hexdigest()
+
+
 def _cidr_address_to_ip_range(*, cidr_address: str) -> list[str]:
     """Convert a CIDR address to a list of IP addresses."""
     cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0]))
@@ -85,7 +106,21 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     if ip_address == "unknown":
         return "unknown"
 
-    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + IPINFO_HASH_SALT).hexdigest()
+    if "IPINFO_CREDENTIALS" not in os.environ:
+        message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!"
+        raise ValueError(message)  # pragma: no cover
+    ipinfo_credentials = os.environ["IPINFO_CREDENTIALS"]
+
+    if "IP_HASH_SALT" not in os.environ:
+        message = (
+            "The environment variable 'IP_HASH_SALT' must be set to import `dandi_s3_log_parser`! "
+            "To retrieve the value, set a temporary value to this environment variable "
+            "and then use the `get_hash_salt` helper function and set it to the correct value."
+        )
+        raise ValueError(message)  # pragma: no cover
+    ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
+
+    ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest()
 
     # Early return for speed
     lookup_result = ip_hash_to_region.get(ip_hash)
@@ -95,7 +130,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
     # Log errors in IP fetching
     # Lines cannot be covered without testing on a real IP
    try:  # pragma: no cover
-        handler = ipinfo.getHandler(access_token=IPINFO_CREDENTIALS)
+        handler = ipinfo.getHandler(access_token=ipinfo_credentials)
         details = handler.getDetails(ip_address=ip_address)
 
         country = details.details.get("country", None)
@@ -121,7 +156,7 @@ def _get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, st
         errors_folder_path = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors"
         errors_folder_path.mkdir(exist_ok=True)
 
-        dandi_s3_log_parser_version = importlib_version(distribution_name="dandi_s3_log_parser")
+        dandi_s3_log_parser_version = importlib.metadata.version(distribution_name="dandi_s3_log_parser")
         date = datetime.datetime.now().strftime("%y%m%d")
         lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_ipinfo_errors.txt"

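An end-to-end sketch of the anonymization flow after this commit; the folder path and IP address are hypothetical, since only the maintainers hold the real raw logs. Derive the salt from the private log folder, export it as IP_HASH_SALT, and hash an address the same way _get_region_from_ip_address now does internally:

import hashlib
import os

from dandi_s3_log_parser import get_hash_salt

# One-time setup: compute the salt and store it as the IP_HASH_SALT secret.
os.environ["IP_HASH_SALT"] = get_hash_salt(base_raw_s3_log_folder_path="/path/to/raw_s3_logs")

# Per-lookup hashing, mirroring the logic added in _get_region_from_ip_address.
ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"])
ip_hash = hashlib.sha1(bytes("192.0.2.1", "utf-8") + ip_hash_salt).hexdigest()
print(ip_hash)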
