diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py index 3c05cbf..ce39a2e 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py @@ -13,7 +13,7 @@ _load_ip_address_to_region_cache, _save_ip_address_to_region_cache, ) -from ._s3_log_line_parser import ReducedLogLine, _append_reduced_log_line +from ._s3_log_line_parser import _ReducedLogLine, _append_reduced_log_line from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._buffered_text_reader import BufferedTextReader from ._order_parsed_logs import order_parsed_logs @@ -155,7 +155,7 @@ def _get_reduced_log_lines( excluded_ips: collections.defaultdict[str, bool], tqdm_kwargs: dict | None = None, maximum_buffer_size_in_bytes: int = 4 * 10**9, -) -> list[ReducedLogLine]: +) -> list[_ReducedLogLine]: """ Reduce the full S3 log file to minimal content and return a list of in-memory collections.namedtuple objects. diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index 08b4edb..d938fda 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -23,7 +23,7 @@ from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._ip_utils import _get_region_from_ip_address -FULL_PATTERN_TO_FIELD_MAPPING = [ +_FULL_PATTERN_TO_FIELD_MAPPING = [ "bucket_owner", "bucket", "timestamp", @@ -50,25 +50,72 @@ "endpoint", "tls_version", "access_point_arn", - "extra", # TODO: Never figured out what this field is... ] -REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "region"] +_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "region"] -FullLogLine = collections.namedtuple("FullLogLine", FULL_PATTERN_TO_FIELD_MAPPING) -ReducedLogLine = collections.namedtuple("ReducedLogLine", REDUCED_PATTERN_TO_FIELD_MAPPING) +_FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING) +_ReducedLogLine = collections.namedtuple("ReducedLogLine", _REDUCED_PATTERN_TO_FIELD_MAPPING) +_S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)') -# Original -# S3_LOG_REGEX = re.compile(r'(?:"([^"]+)")|(?:\[([^\]]+)\])|([^ ]+)') +def _find_all_possible_substring_indices(*, string: str, substring: str) -> list[int]: + indices = list() + start = 0 + while True: + next_index = string.find(substring, start) + if next_index == -1: # .find(...) was unable to locate the substring + break + indices.append(next_index) + start = next_index + 1 -# AI corrected... -S3_LOG_REGEX = re.compile(r'"([^"]+)"|\[([^]]+)]|([^ ]+)') + return indices + + +def _attempt_to_remove_bad_quotes(*, raw_line: str, bad_parsed_line: list[str]) -> str | list[str]: + """ + Attempt to remove bad quotes from a raw line of an S3 log file. + + These quotes are not properly escaped and are causing issues with the regex pattern. + Various attempts to fix the regex failed, so this is the most reliable correction I could find.
+ """ + starting_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring=' "') + ending_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring='" ') + + # If even further unexpected structure, just return the bad parsed line so that the error reporter can catch it + if len(starting_quotes_indices) == 0: + return bad_parsed_line + if len(starting_quotes_indices) != len(ending_quotes_indices): + return bad_parsed_line + + cleaned_raw_line = raw_line[0 : starting_quotes_indices[0]] + for counter in range(1, len(starting_quotes_indices) - 1): + next_block = raw_line[ending_quotes_indices[counter - 1] + 2 : starting_quotes_indices[counter]] + cleaned_raw_line += " - " + next_block + cleaned_raw_line += " - " + raw_line[ending_quotes_indices[-1] + 2 :] + + return cleaned_raw_line def _parse_s3_log_line(*, raw_line: str) -> list[str]: - """The current method of parsing lines of an S3 log file.""" - parsed_log_line = [a or b or c for a, b, c in S3_LOG_REGEX.findall(raw_line)] + """ + The current method of parsing lines of an S3 log file. + + Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing + as a pre-step. No self-contained single regex was found that could account for this uncorrected strings. + """ + parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=raw_line)] + + number_of_parsed_items = len(parsed_log_line) + + # Everything worked as expected + if number_of_parsed_items <= 26: + return parsed_log_line + + potentially_cleaned_raw_line = _attempt_to_remove_bad_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) + parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)] return parsed_log_line @@ -79,7 +124,7 @@ def _get_full_log_line( log_file_path: pathlib.Path, index: int, raw_line: str, -) -> FullLogLine | None: +) -> _FullLogLine | None: """Construct a FullLogLine from a single parsed log line, or dump to error collection file and return None.""" full_log_line = None @@ -88,15 +133,13 @@ def _get_full_log_line( # ARN not detected case 24: parsed_log_line.append("-") - parsed_log_line.append("-") - full_log_line = FullLogLine(*parsed_log_line) - # Expected form most of the time + full_log_line = _FullLogLine(*parsed_log_line) + # Expected length for good lines case 25: - parsed_log_line.append("-") - full_log_line = FullLogLine(*parsed_log_line) - # Happens for certain types of HEAD requests + full_log_line = _FullLogLine(*parsed_log_line) + # Happens for certain types of HEAD requests; not sure what the extra element is case 26: - full_log_line = FullLogLine(*parsed_log_line) + full_log_line = _FullLogLine(*parsed_log_line[:25]) # Deviant log entry; usually some very ill-formed content in the URI # Dump information to a log file in the base folder for easy sharing @@ -117,7 +160,7 @@ def _get_full_log_line( def _append_reduced_log_line( *, raw_line: str, - reduced_log_lines: list[ReducedLogLine], + reduced_log_lines: list[_ReducedLogLine], bucket: str, request_type: str, excluded_ips: collections.defaultdict[str, bool], @@ -181,7 +224,7 @@ def _append_reduced_log_line( parsed_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S") parsed_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 region = _get_region_from_ip_address(ip_hash_to_region=ip_hash_to_region, ip_address=full_log_line.remote_ip) - reduced_log_line = ReducedLogLine( + 
reduced_log_line = _ReducedLogLine( asset_id=full_log_line.asset_id, timestamp=parsed_timestamp, bytes_sent=parsed_bytes_sent, diff --git a/tests/examples/ordered_example_2/example_dandi_s3_log.log b/tests/examples/ordered_example_2/example_dandi_s3_log.log new file mode 100644 index 0000000..81d1749 --- /dev/null +++ b/tests/examples/ordered_example_2/example_dandi_s3_log.log @@ -0,0 +1,3 @@ +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..76f3a91 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2022-05-04 05:06:35 512 unknown diff --git a/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..6980324 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2021-12-31 23:06:42 1443 unknown diff --git a/tests/test_dandi_s3_log_parser.py b/tests/test_dandi_s3_log_parser.py deleted file mode 100644 index 60a7ce3..0000000 --- a/tests/test_dandi_s3_log_parser.py +++ /dev/null @@ -1,127 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): - """ - Most basic test of functionality. 
- - If there are failures in the parsing of any lines found in application, - please raise an issue and contribute them to the example log collection. - """ - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_0" - example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_0" - dandi_s3_log_parser.parse_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -def test_parse_all_dandi_raw_s3_logs_example_0(tmpdir: py.path.local): - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_1" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" - dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( - base_raw_s3_log_folder_path=examples_folder_path, - parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
- - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local): - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_1" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" - dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( - base_raw_s3_log_folder_path=examples_folder_path, - parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, - maximum_number_of_workers=2, - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -# TODO: add tests for API and CLI usage of finding random example line from testing submodule diff --git a/tests/test_parse_all_dandi_raw_s3_logs.py b/tests/test_parse_all_dandi_raw_s3_logs.py new file mode 100644 index 0000000..afdc76a --- /dev/null +++ b/tests/test_parse_all_dandi_raw_s3_logs.py @@ -0,0 +1,46 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_all_dandi_raw_s3_logs_example_0(tmpdir: py.path.local): + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_1" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" + dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( + base_raw_s3_log_folder_path=examples_folder_path, + parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" 
+ + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + +# TODO: add CLI diff --git a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py new file mode 100644 index 0000000..a975df9 --- /dev/null +++ b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py @@ -0,0 +1,47 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local): + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_1" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" + dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( + base_raw_s3_log_folder_path=examples_folder_path, + parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, + maximum_number_of_workers=2, + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
+ + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + +# TODO: add CLI diff --git a/tests/test_parse_dandi_raw_s3_log.py b/tests/test_parse_dandi_raw_s3_log.py new file mode 100644 index 0000000..329b180 --- /dev/null +++ b/tests/test_parse_dandi_raw_s3_log.py @@ -0,0 +1,49 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): + """ + Most basic test of functionality. + + If there are failures in the parsing of any lines encountered in practice, + please raise an issue and contribute them to the example log collection. + """ + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_0" + example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_0" + dandi_s3_log_parser.parse_dandi_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) diff --git a/tests/test_parse_dandi_raw_s3_log_bad_lines.py b/tests/test_parse_dandi_raw_s3_log_bad_lines.py new file mode 100644 index 0000000..9ae2718 --- /dev/null +++ b/tests/test_parse_dandi_raw_s3_log_bad_lines.py @@ -0,0 +1,56 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_dandi_raw_s3_log_example_2(tmpdir: py.path.local): + """ + 'ordered_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time.
+ """ + tmpdir = pathlib.Path(tmpdir) + + # Count initial error folder contents + error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" + error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + initial_number_of_error_folder_contents = len(error_folder_contents) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_2" + example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_2" + dandi_s3_log_parser.parse_dandi_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + assert ( + len(post_test_error_folder_contents) == initial_number_of_error_folder_contents + ), "Errors occurred during line parsing!"