From 19ac9acba5a55e78c1b6574d34407a0d8033b0a9 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:53:50 -0400 Subject: [PATCH] add line index to parser for easier lookup in source data (#33) Co-authored-by: CodyCBakerPhD --- src/dandi_s3_log_parser/_s3_log_file_parser.py | 5 ++++- src/dandi_s3_log_parser/_s3_log_line_parser.py | 14 ++++++++------ .../blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv | 4 ++-- .../blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 4 ++-- .../blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv | 6 +++--- .../blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 6 +++--- .../blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv | 4 ++-- .../blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv | 4 ++-- .../blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 4 ++-- 9 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py index 80ae153..c23e4b0 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py @@ -90,6 +90,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str: reduced_logs_binned_by_unparsed_asset[raw_asset_id]["timestamp"].append(reduced_log.timestamp) reduced_logs_binned_by_unparsed_asset[raw_asset_id]["bytes_sent"].append(reduced_log.bytes_sent) reduced_logs_binned_by_unparsed_asset[raw_asset_id]["ip_address"].append(reduced_log.ip_address) + reduced_logs_binned_by_unparsed_asset[raw_asset_id]["line_index"].append(reduced_log.line_index) if asset_id_handler is not None: reduced_logs_binned_by_asset = dict() @@ -160,6 +161,8 @@ def _get_reduced_log_lines( ): index = 0 for raw_line in buffered_raw_lines: + line_index = per_buffer_index + index + _append_reduced_log_line( raw_line=raw_line, reduced_log_lines=reduced_log_lines, @@ -167,7 +170,7 @@ def _get_reduced_log_lines( request_type=request_type, excluded_ips=excluded_ips, log_file_path=raw_s3_log_file_path, - index=index, + line_index=line_index, ) index += 1 per_buffer_index += index diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index 9501ca6..af9ef1b 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -50,7 +50,7 @@ "tls_version", "access_point_arn", ] -_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "ip_address"] +_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "ip_address", "line_index"] _FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING) _ReducedLogLine = collections.namedtuple("ReducedLogLine", _REDUCED_PATTERN_TO_FIELD_MAPPING) @@ -121,7 +121,7 @@ def _get_full_log_line( *, parsed_log_line: list[str], log_file_path: pathlib.Path, - index: int, + line_index: int, raw_line: str, ) -> _FullLogLine | None: """Construct a FullLogLine from a single parsed log line, or dump to error collection file and return None.""" @@ -151,7 +151,7 @@ def _get_full_log_line( lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_lines_errors.txt" with open(file=lines_errors_file_path, mode="a") as io: - io.write(f"Line {index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}\n\n") + io.write(f"Line {line_index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}\n\n") return full_log_line @@ -163,8 +163,8 @@ def _append_reduced_log_line( bucket: str, request_type: str, excluded_ips: collections.defaultdict[str, bool], + line_index: int, log_file_path: pathlib.Path, - index: int, ) -> None: """ Append the `reduced_log_lines` list with a ReducedLogLine constructed from a single raw log line, if it is valid. @@ -182,7 +182,8 @@ def _append_reduced_log_line( The type of request to filter for. excluded_ips : collections.defaultdict of strings to booleans A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. - + line_index: int + The index of the line in the raw log file. """ bucket = "" if bucket is None else bucket excluded_ips = excluded_ips or collections.defaultdict(bool) @@ -192,7 +193,7 @@ def _append_reduced_log_line( full_log_line = _get_full_log_line( parsed_log_line=parsed_log_line, log_file_path=log_file_path, - index=index, + line_index=line_index, raw_line=raw_line, ) @@ -226,6 +227,7 @@ def _append_reduced_log_line( timestamp=parsed_timestamp, bytes_sent=parsed_bytes_sent, ip_address=full_log_line.ip_address, + line_index=line_index, ) reduced_log_lines.append(reduced_log_line) diff --git a/tests/examples/parsed_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/parsed_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv index ce1959a..1cc8230 100644 --- a/tests/examples/parsed_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv +++ b/tests/examples/parsed_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address -2022-05-04 05:06:35 512 192.0.2.0 +timestamp bytes_sent ip_address line_index +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/parsed_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/parsed_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv index b4bb825..5b97e6c 100644 --- a/tests/examples/parsed_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ b/tests/examples/parsed_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address -2021-12-31 23:06:42 1443 192.0.2.0 +timestamp bytes_sent ip_address line_index +2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/parsed_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/parsed_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv index 9c8da44..570c480 100644 --- a/tests/examples/parsed_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv +++ b/tests/examples/parsed_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -1,3 +1,3 @@ -timestamp bytes_sent ip_address -2022-03-16 02:21:12 512 192.0.2.0 -2022-05-04 05:06:35 512 192.0.2.0 +timestamp bytes_sent ip_address line_index +2022-03-16 02:21:12 512 192.0.2.0 1 +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/parsed_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/parsed_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv index 7a2e63d..0851ae8 100644 --- a/tests/examples/parsed_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ b/tests/examples/parsed_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -1,3 +1,3 @@ -timestamp bytes_sent ip_address -2021-04-24 12:03:05 1443 192.0.2.0 -2021-12-31 23:06:42 1443 192.0.2.0 +timestamp bytes_sent ip_address line_index +2021-04-24 12:03:05 1443 192.0.2.0 0 +2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/parsed_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/examples/parsed_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv index 89d2979..a55bf13 100644 --- a/tests/examples/parsed_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv +++ b/tests/examples/parsed_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address -2023-06-26 03:05:53 6616308 192.0.2.0 +timestamp bytes_sent ip_address line_index +2023-06-26 03:05:53 6616308 192.0.2.0 3 diff --git a/tests/examples/parsed_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/parsed_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv index ce1959a..1cc8230 100644 --- a/tests/examples/parsed_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv +++ b/tests/examples/parsed_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address -2022-05-04 05:06:35 512 192.0.2.0 +timestamp bytes_sent ip_address line_index +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/parsed_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/parsed_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv index b4bb825..5b97e6c 100644 --- a/tests/examples/parsed_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ b/tests/examples/parsed_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address -2021-12-31 23:06:42 1443 192.0.2.0 +timestamp bytes_sent ip_address line_index +2021-12-31 23:06:42 1443 192.0.2.0 0