diff --git a/src/dandi_s3_log_parser/_buffered_text_reader.py b/src/dandi_s3_log_parser/_buffered_text_reader.py index 05917db..38ec423 100644 --- a/src/dandi_s3_log_parser/_buffered_text_reader.py +++ b/src/dandi_s3_log_parser/_buffered_text_reader.py @@ -3,8 +3,7 @@ class BufferedTextReader: def __init__(self, *, file_path: str | pathlib.Path, maximum_buffer_size_in_bytes: int = 10**9): - """ - Lazily read a text file into RAM using buffers of a specified size. + """Lazily read a text file into RAM using buffers of a specified size. Parameters ---------- @@ -13,6 +12,7 @@ def __init__(self, *, file_path: str | pathlib.Path, maximum_buffer_size_in_byte maximum_buffer_size_in_bytes : int, default: 1 GB The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the source text file. + """ self.file_path = file_path self.maximum_buffer_size_in_bytes = maximum_buffer_size_in_bytes @@ -50,7 +50,7 @@ def __next__(self) -> list[str]: if len(buffer) == 0 and last_line != "": raise ValueError( f"BufferedTextReader encountered a line at offset {self.offset} that exceeds the buffer " - "size! Try increasing the `maximum_buffer_size_in_bytes` to account for this line." + "size! Try increasing the `maximum_buffer_size_in_bytes` to account for this line.", ) # The last line split by the intermediate buffer may or may not be incomplete diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 6c921c6..ede1c86 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -3,12 +3,16 @@ import collections import os import pathlib -import click from typing import Literal -from ._dandi_s3_log_file_parser import parse_dandi_raw_s3_log, parse_all_dandi_raw_s3_logs -from .testing._helpers import find_random_example_line +import click + from ._config import REQUEST_TYPES +from ._dandi_s3_log_file_parser import ( + parse_all_dandi_raw_s3_logs, + parse_dandi_raw_s3_log, +) +from .testing._helpers import find_random_example_line NUMBER_OF_CPU = os.cpu_count() # Note: Not distinguishing if logical or not diff --git a/src/dandi_s3_log_parser/_config.py b/src/dandi_s3_log_parser/_config.py index ef6895f..f7a4eff 100644 --- a/src/dandi_s3_log_parser/_config.py +++ b/src/dandi_s3_log_parser/_config.py @@ -1,6 +1,6 @@ +import hashlib import os import pathlib -import hashlib REQUEST_TYPES = ("GET", "PUT", "HEAD") @@ -13,7 +13,7 @@ if "IPINFO_CREDENTIALS" not in os.environ: raise ValueError( - "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!" + "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!", ) # pragma: no cover IPINFO_CREDENTIALS = os.environ["IPINFO_CREDENTIALS"] @@ -21,14 +21,13 @@ raise ValueError( "The environment variable 'IPINFO_HASH_SALT' must be set to import `dandi_s3_log_parser`! " "To retrieve the value, set a temporary value to this environment variable and then use the `get_hash_salt` " - "helper function and set it to the correct value." + "helper function and set it to the correct value.", ) # pragma: no cover IPINFO_HASH_SALT = bytes.fromhex(os.environ["IPINFO_HASH_SALT"]) def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str: - """ - Calculate the salt (in hexadecimal encoding) used for IP hashing. + """Calculate the salt (in hexadecimal encoding) used for IP hashing. 
Uses actual data from the first line of the first log file in the raw S3 log folder, which only we have access to. @@ -39,7 +38,7 @@ def get_hash_salt(base_raw_s3_log_folder_path: str | pathlib.Path) -> str: # Retrieve the first line of the first log file (which only we know) and use that as a secure salt first_log_file_path = base_raw_s3_log_folder_path / "2019" / "10" / "01.log" - with open(file=first_log_file_path, mode="r") as io: + with open(file=first_log_file_path) as io: first_line = io.readline() hash_salt = hashlib.sha1(string=bytes(first_line, "utf-8")) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index 734fcb0..f83f43b 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -2,25 +2,26 @@ import collections import datetime -import pathlib +import importlib.metadata import os +import pathlib import shutil import traceback import uuid +from collections.abc import Callable from concurrent.futures import ProcessPoolExecutor, as_completed -from typing import Callable, Literal -import importlib.metadata +from typing import Literal import pandas -from pydantic import validate_call, Field, FilePath import tqdm +from pydantic import Field, FilePath, validate_call +from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._ip_utils import ( _get_latest_github_ip_ranges, ) -from ._s3_log_file_parser import parse_raw_s3_log -from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._order_and_anonymize_parsed_logs import order_and_anonymize_parsed_logs +from ._s3_log_file_parser import parse_raw_s3_log @validate_call @@ -34,8 +35,7 @@ def parse_all_dandi_raw_s3_logs( maximum_number_of_workers: int = Field(ge=1, le=os.cpu_count(), default=1), maximum_buffer_size_in_bytes: int = 4 * 10**9, ) -> None: - """ - Batch parse all raw S3 log files in a folder and write the results to a folder of TSV files. + """Batch parse all raw S3 log files in a folder and write the results to a folder of TSV files. Assumes the following folder structure... @@ -66,6 +66,7 @@ def parse_all_dandi_raw_s3_logs( Actual total RAM usage will be higher due to overhead and caching. Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is greater than one. 
+ """ base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path) parsed_s3_log_folder_path = pathlib.Path(parsed_s3_log_folder_path) @@ -138,7 +139,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: temporary_folder_path=temporary_folder_path, excluded_ips=excluded_ips, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes_per_job, - ) + ), ) progress_bar_iterable = tqdm.tqdm( @@ -177,7 +178,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: header = False if merged_temporary_file_path.exists() else True parsed_s3_log.to_csv( - path_or_buf=merged_temporary_file_path, mode="a", sep="\t", header=header, index=False + path_or_buf=merged_temporary_file_path, mode="a", sep="\t", header=header, index=False, ) print("\n\n") @@ -192,7 +193,6 @@ def asset_id_handler(*, raw_asset_id: str) -> str: shutil.rmtree(path=temporary_base_folder_path, ignore_errors=True) - return None # Function cannot be covered because the line calls occur on subprocesses @@ -205,13 +205,11 @@ def _multi_job_parse_dandi_raw_s3_log( excluded_ips: collections.defaultdict[str, bool] | None, maximum_buffer_size_in_bytes: int, ) -> None: - """ - A mostly pass-through function to calculate the job index on the worker and target the correct subfolder. + """A mostly pass-through function to calculate the job index on the worker and target the correct subfolder. Also dumps error stack (which is only typically seen by the worker and not sent back to the main stdout pipe) to a log file. """ - try: error_message = "" @@ -248,10 +246,9 @@ def asset_id_handler(*, raw_asset_id: str) -> str: ) except Exception as exception: with open(file=parallel_errors_file_path, mode="a") as io: - error_message += f"{type(exception)}: {str(exception)}\n\n{traceback.format_exc()}\n\n" + error_message += f"{type(exception)}: {exception!s}\n\n{traceback.format_exc()}\n\n" io.write(error_message) - return None def parse_dandi_raw_s3_log( @@ -266,8 +263,7 @@ def parse_dandi_raw_s3_log( maximum_buffer_size_in_bytes: int = 4 * 10**9, order_results: bool = True, ) -> None: - """ - Parse a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. + """Parse a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. 'Parsing' here means: - limiting only to requests of the specified type (i.e., GET, PUT, etc.) @@ -308,6 +304,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: Whether to order the results chronologically. This is strongly suggested, but a common case of disabling it is if ordering is intended to be applied after multiple steps of processing instead of during this operation. 
+ """ raw_s3_log_file_path = pathlib.Path(raw_s3_log_file_path) parsed_s3_log_folder_path = pathlib.Path(parsed_s3_log_folder_path) @@ -342,4 +339,3 @@ def asset_id_handler(*, raw_asset_id: str) -> str: order_results=order_results, ) - return None diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 1730f42..347c14d 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -1,10 +1,9 @@ """Various private utility functions for handling IP address related tasks.""" -import ipaddress +import datetime import hashlib +import ipaddress import traceback -import datetime -from typing import List from importlib.metadata import version as importlib_version import ipinfo @@ -14,13 +13,13 @@ from ._config import ( _IP_HASH_TO_REGION_FILE_PATH, + DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, IPINFO_CREDENTIALS, IPINFO_HASH_SALT, - DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, ) -def _cidr_address_to_ip_range(cidr_address: str) -> List[str]: +def _cidr_address_to_ip_range(cidr_address: str) -> list[str]: """Convert a CIDR address to a list of IP addresses.""" cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0])) ip_address_range = list() @@ -58,12 +57,12 @@ def _load_ip_address_to_region_cache(ip_hash_to_region_file_path: FilePath | Non if not ip_hash_to_region_file_path.exists(): return dict() # pragma: no cover - with open(file=ip_hash_to_region_file_path, mode="r") as stream: + with open(file=ip_hash_to_region_file_path) as stream: return yaml.load(stream=stream, Loader=yaml.SafeLoader) def _save_ip_address_to_region_cache( - ip_hash_to_region: dict[str, str], ip_hash_to_region_file_path: FilePath | None = None + ip_hash_to_region: dict[str, str], ip_hash_to_region_file_path: FilePath | None = None, ) -> None: """Save the IP address to region cache to disk.""" ip_hash_to_region_file_path = ip_hash_to_region_file_path or _IP_HASH_TO_REGION_FILE_PATH @@ -73,15 +72,14 @@ def _save_ip_address_to_region_cache( def _get_region_from_ip_address(ip_hash_to_region: dict[str, str], ip_address: str) -> str | None: - """ - If the parsed S3 logs are meant to be shared openly, the remote IP could be used to directly identify individuals. + """If the parsed S3 logs are meant to be shared openly, the remote IP could be used to directly identify individuals. Instead, identify the generic region of the world the request came from and report that instead. 
""" ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + IPINFO_HASH_SALT).hexdigest() # Early return for speed - lookup_result = ip_hash_to_region.get(ip_hash, None) + lookup_result = ip_hash_to_region.get(ip_hash) if lookup_result is not None: return lookup_result @@ -121,8 +119,8 @@ def _get_region_from_ip_address(ip_hash_to_region: dict[str, str], ip_address: s with open(file=lines_errors_file_path, mode="a") as io: io.write( f"Error fetching IP information for {ip_address}!\n\n" - f"{type(exception)}: {str(exception)}\n\n" - f"{traceback.format_exc()}" + f"{type(exception)}: {exception!s}\n\n" + f"{traceback.format_exc()}", ) return "unknown" diff --git a/src/dandi_s3_log_parser/_order_and_anonymize_parsed_logs.py b/src/dandi_s3_log_parser/_order_and_anonymize_parsed_logs.py index 82539ec..4dc78dc 100644 --- a/src/dandi_s3_log_parser/_order_and_anonymize_parsed_logs.py +++ b/src/dandi_s3_log_parser/_order_and_anonymize_parsed_logs.py @@ -5,7 +5,7 @@ def order_and_anonymize_parsed_logs( - unordered_parsed_s3_log_folder_path: pathlib.Path, ordered_and_anonymized_s3_log_folder_path: pathlib.Path + unordered_parsed_s3_log_folder_path: pathlib.Path, ordered_and_anonymized_s3_log_folder_path: pathlib.Path, ) -> None: """Order the contents of all parsed log files chronologically.""" ordered_and_anonymized_s3_log_folder_path.mkdir(exist_ok=True) @@ -29,5 +29,5 @@ def order_and_anonymize_parsed_logs( ordered_and_anonymized_s3_log_folder_path / unordered_parsed_s3_log_file_path.name ) ordered_and_anonymized_parsed_s3_log.to_csv( - path_or_buf=ordered_and_anonymized_parsed_s3_log_file_path, sep="\t", header=True, index=True + path_or_buf=ordered_and_anonymized_parsed_s3_log_file_path, sep="\t", header=True, index=True, ) diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py index 54a2732..4e7754f 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py @@ -4,14 +4,15 @@ import pathlib import shutil import uuid -from typing import Callable, Literal +from collections.abc import Callable +from typing import Literal import pandas import tqdm -from ._s3_log_line_parser import _ReducedLogLine, _append_reduced_log_line from ._buffered_text_reader import BufferedTextReader from ._order_and_anonymize_parsed_logs import order_and_anonymize_parsed_logs +from ._s3_log_line_parser import _append_reduced_log_line, _ReducedLogLine def parse_raw_s3_log( @@ -27,8 +28,7 @@ def parse_raw_s3_log( maximum_buffer_size_in_bytes: int = 4 * 10**9, order_results: bool = True, ) -> None: - """ - Parse a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. + """Parse a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. 'Parsing' here means: - limiting only to requests of the specified type (i.e., GET, PUT, etc.) @@ -71,6 +71,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: Whether to order the results chronologically. This is strongly suggested, but a common case of disabling it is if ordering is intended to be applied after multiple steps of processing instead of during this operation. 
+ """ raw_s3_log_file_path = pathlib.Path(raw_s3_log_file_path) parsed_s3_log_folder_path = pathlib.Path(parsed_s3_log_folder_path) @@ -103,7 +104,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: for reduced_log in reduced_logs: raw_asset_id = reduced_log.asset_id reduced_logs_binned_by_unparsed_asset[raw_asset_id] = reduced_logs_binned_by_unparsed_asset.get( - raw_asset_id, collections.defaultdict(list) + raw_asset_id, collections.defaultdict(list), ) reduced_logs_binned_by_unparsed_asset[raw_asset_id]["timestamp"].append(reduced_log.timestamp) @@ -136,7 +137,6 @@ def asset_id_handler(*, raw_asset_id: str) -> str: shutil.rmtree(path=temporary_output_folder_path, ignore_errors=True) - return None def _get_reduced_log_lines( @@ -149,8 +149,7 @@ def _get_reduced_log_lines( maximum_buffer_size_in_bytes: int = 4 * 10**9, ip_hash_to_region_file_path: pathlib.Path | None, ) -> list[_ReducedLogLine]: - """ - Reduce the full S3 log file to minimal content and return a list of in-memory collections.namedtuple objects. + """Reduce the full S3 log file to minimal content and return a list of in-memory collections.namedtuple objects. Parameters ---------- @@ -167,6 +166,7 @@ def _get_reduced_log_lines( maximum_buffer_size_in_bytes : int, default: 4 GB The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the source text file. + """ assert raw_s3_log_file_path.suffix == ".log", f"{raw_s3_log_file_path=} should end in '.log'!" @@ -181,10 +181,10 @@ def _get_reduced_log_lines( reduced_log_lines = list() per_buffer_index = 0 buffered_text_reader = BufferedTextReader( - file_path=raw_s3_log_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes + file_path=raw_s3_log_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, ) for buffered_raw_lines in tqdm.tqdm( - iterable=buffered_text_reader, total=len(buffered_text_reader), **resolved_tqdm_kwargs + iterable=buffered_text_reader, total=len(buffered_text_reader), **resolved_tqdm_kwargs, ): index = 0 for raw_line in buffered_raw_lines: diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index 4c7f2b9..578c8ae 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -1,5 +1,4 @@ -""" -Primary functions for parsing a single line of a raw S3 log. +"""Primary functions for parsing a single line of a raw S3 log. The strategy is to... @@ -16,9 +15,9 @@ import collections import datetime +import importlib.metadata import pathlib import re -import importlib.metadata from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._ip_utils import _get_region_from_ip_address @@ -73,8 +72,7 @@ def _find_all_possible_substring_indices(*, string: str, substring: str) -> list def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: - """ - Attempt to remove bad quotes from a raw line of an S3 log file. + """Attempt to remove bad quotes from a raw line of an S3 log file. These quotes are not properly escaped and are causing issues with the regex pattern. Various attempts to fix the regex failed, so this is the most reliable correction I could find. @@ -98,8 +96,7 @@ def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: def _parse_s3_log_line(*, raw_line: str) -> list[str]: - """ - The current method of parsing lines of an S3 log file. + """The current method of parsing lines of an S3 log file. 
Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing as a pre-step. No self-contained single regex was found that could account for this uncorrected strings. @@ -168,8 +165,7 @@ def _append_reduced_log_line( index: int, ip_hash_to_region: dict[str, str], ) -> None: - """ - Append the `reduced_log_lines` list with a ReducedLogLine constructed from a single raw log line, if it is valid. + """Append the `reduced_log_lines` list with a ReducedLogLine constructed from a single raw log line, if it is valid. Parameters ---------- @@ -184,6 +180,7 @@ def _append_reduced_log_line( The type of request to filter for. excluded_ips : collections.defaultdict of strings to booleans A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. + """ bucket = "" if bucket is None else bucket excluded_ips = excluded_ips or collections.defaultdict(bool) @@ -198,24 +195,24 @@ def _append_reduced_log_line( ) if full_log_line is None: - return None + return # Various early skip conditions if full_log_line.bucket != bucket: - return None + return # Skip all non-success status codes (those in the 200 block) if full_log_line.status_code[0] != "2": - return None + return # Derived from command string, e.g., "HEAD /blobs/b38/..." # Subset first 7 characters for performance parsed_request_type = full_log_line.operation.split(".")[1] if parsed_request_type != request_type: - return None + return if excluded_ips[full_log_line.ip_address] is True: - return None + return assert ( full_log_line.timestamp[-5:] == "+0000" diff --git a/src/dandi_s3_log_parser/testing/_helpers.py b/src/dandi_s3_log_parser/testing/_helpers.py index acba082..70bcb32 100644 --- a/src/dandi_s3_log_parser/testing/_helpers.py +++ b/src/dandi_s3_log_parser/testing/_helpers.py @@ -14,8 +14,7 @@ def find_random_example_line( maximum_lines_per_request_type: int = 5, seed: int = 0, ) -> str: - """ - Return a randomly chosen line from a folder of raw S3 log files to serve as an example for testing purposes. + """Return a randomly chosen line from a folder of raw S3 log files to serve as an example for testing purposes. Parameters ---------- @@ -30,6 +29,7 @@ def find_random_example_line( These lines are always found chronologically from the start of the file. seed : int The seed to use for the random number generator. + """ raw_s3_log_folder_path = pathlib.Path(raw_s3_log_folder_path) @@ -61,7 +61,7 @@ def find_random_example_line( # Safe - but possibly slower for random_log_file_path in all_raw_s3_log_file_paths: - with open(file=random_log_file_path, mode="r") as io: + with open(file=random_log_file_path) as io: all_lines = io.readlines() # 170 is just an estimation @@ -87,7 +87,7 @@ def find_random_example_line( print( f"No lines found for request type ('{request_type}') in file '{random_log_file_path}'! " - "Scanning the next file..." 
+ "Scanning the next file...", ) if running_counts_by_request_type[request_type] > maximum_lines_per_request_type: diff --git a/tests/test_buffered_text_reader.py b/tests/test_buffered_text_reader.py index 7ef6b13..98bc6fd 100644 --- a/tests/test_buffered_text_reader.py +++ b/tests/test_buffered_text_reader.py @@ -37,7 +37,7 @@ def single_line_text_file_path(tmp_path_factory: pytest.TempPathFactory): def test_buffered_text_reader(large_text_file_path: pathlib.Path): maximum_buffer_size_in_bytes = 10**6 # 1 MB buffered_text_reader = dandi_s3_log_parser.BufferedTextReader( - file_path=large_text_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes + file_path=large_text_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, ) assert iter(buffered_text_reader) is buffered_text_reader, "BufferedTextReader object is not iterable!" @@ -58,7 +58,7 @@ def test_value_error(single_line_text_file_path: pathlib.Path): maximum_buffer_size_in_bytes = 10**6 # 1 MB with pytest.raises(ValueError) as error_info: buffered_text_reader = dandi_s3_log_parser.BufferedTextReader( - file_path=single_line_text_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes + file_path=single_line_text_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, ) next(buffered_text_reader) diff --git a/tests/test_order_and_anonymize.py b/tests/test_order_and_anonymize.py index 8970324..4efa422 100644 --- a/tests/test_order_and_anonymize.py +++ b/tests/test_order_and_anonymize.py @@ -1,6 +1,7 @@ import pathlib -import py + import pandas +import py import dandi_s3_log_parser @@ -24,11 +25,11 @@ def test_order_and_anonymize(tmpdir: py.path.local) -> None: expected_ordered_and_anonymized_s3_log_file_path = expected_output_folder_path / parsed_log_file_name test_ordered_and_anonymized_s3_log = pandas.read_table( - filepath_or_buffer=test_ordered_and_anonymized_s3_log_file_path, index_col=0 + filepath_or_buffer=test_ordered_and_anonymized_s3_log_file_path, index_col=0, ) expected_ordered_and_anonymized_s3_log = pandas.read_table( - filepath_or_buffer=expected_ordered_and_anonymized_s3_log_file_path, index_col=0 + filepath_or_buffer=expected_ordered_and_anonymized_s3_log_file_path, index_col=0, ) pandas.testing.assert_frame_equal( - left=test_ordered_and_anonymized_s3_log, right=expected_ordered_and_anonymized_s3_log + left=test_ordered_and_anonymized_s3_log, right=expected_ordered_and_anonymized_s3_log, ) diff --git a/tests/test_parse_dandi_raw_s3_log.py b/tests/test_parse_dandi_raw_s3_log.py index 23dbd53..21b3b71 100644 --- a/tests/test_parse_dandi_raw_s3_log.py +++ b/tests/test_parse_dandi_raw_s3_log.py @@ -7,8 +7,7 @@ def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): - """ - Most basic test of functionality. + """Most basic test of functionality. If there are failures in the parsing of any lines found in application, please raise an issue and contribute them to the example log collection. 
@@ -22,7 +21,7 @@ def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): test_parsed_s3_log_folder_path = tmpdir / "parsed_example_0" dandi_s3_log_parser.parse_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, ) test_output_file_paths = [path for path in test_parsed_s3_log_folder_path.iterdir() if path.is_file()] diff --git a/tests/test_parse_dandi_raw_s3_log_bad_lines.py b/tests/test_parse_dandi_raw_s3_log_bad_lines.py index 5c34f13..1868f8b 100644 --- a/tests/test_parse_dandi_raw_s3_log_bad_lines.py +++ b/tests/test_parse_dandi_raw_s3_log_bad_lines.py @@ -7,8 +7,7 @@ def test_parse_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local): - """ - 'ordered_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time. + """'ordered_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time. """ tmpdir = pathlib.Path(tmpdir) @@ -24,7 +23,7 @@ def test_parse_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local): test_parsed_s3_log_folder_path = tmpdir / "parsed_example_2" dandi_s3_log_parser.parse_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, ) test_output_file_paths = [path for path in test_parsed_s3_log_folder_path.iterdir() if path.is_file()]
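
A minimal usage sketch of the public API touched by this diff, for anyone trying the changes locally. It only uses call signatures that appear above (the `BufferedTextReader` constructor and `parse_dandi_raw_s3_log`, as exercised by the tests); the example paths are hypothetical, and per `_config.py` the `IPINFO_CREDENTIALS` and `IPINFO_HASH_SALT` environment variables must be set before the package is imported.

# Minimal sketch (not part of the diff) exercising the public API the way the tests do.
# The paths below are hypothetical placeholders; set IPINFO_CREDENTIALS and
# IPINFO_HASH_SALT in the environment first, since _config.py raises ValueError
# at import time otherwise.
import pathlib

import dandi_s3_log_parser

raw_s3_log_file_path = pathlib.Path("example_dandi_s3_log.log")  # hypothetical input
parsed_s3_log_folder_path = pathlib.Path("parsed_example")  # hypothetical output folder

# Lazily read the raw log in ~1 MB buffers; each iteration yields a list of complete lines,
# and a ValueError is raised if a single line exceeds the buffer size.
buffered_text_reader = dandi_s3_log_parser.BufferedTextReader(
    file_path=raw_s3_log_file_path,
    maximum_buffer_size_in_bytes=10**6,
)
for buffered_raw_lines in buffered_text_reader:
    print(f"read {len(buffered_raw_lines)} lines in this buffer")

# Parse the same raw log into one TSV per unique asset ID, mirroring the call in the tests.
dandi_s3_log_parser.parse_dandi_raw_s3_log(
    raw_s3_log_file_path=raw_s3_log_file_path,
    parsed_s3_log_folder_path=parsed_s3_log_folder_path,
)

The keyword-only call style mirrors how the tests in this diff invoke the API; both functions also accept the tuning parameters documented in their docstrings (request type, excluded IPs, `maximum_buffer_size_in_bytes`), which are left at their defaults here.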