Skip to content

Commit

Permalink
add line index to parser for easier lookup in source data (#33)
Browse files: browse the repository at this point in the history
Co-authored-by: CodyCBakerPhD <[email protected]>
  • Loading branch information
CodyCBakerPhD and CodyCBakerPhD authored Aug 13, 2024
1 parent d941bc1 commit 19ac9ac
Show file tree
Hide file tree
Showing 9 changed files with 28 additions and 23 deletions.
5 changes: 4 additions & 1 deletion src/dandi_s3_log_parser/_s3_log_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
reduced_logs_binned_by_unparsed_asset[raw_asset_id]["timestamp"].append(reduced_log.timestamp)
reduced_logs_binned_by_unparsed_asset[raw_asset_id]["bytes_sent"].append(reduced_log.bytes_sent)
reduced_logs_binned_by_unparsed_asset[raw_asset_id]["ip_address"].append(reduced_log.ip_address)
reduced_logs_binned_by_unparsed_asset[raw_asset_id]["line_index"].append(reduced_log.line_index)

if asset_id_handler is not None:
reduced_logs_binned_by_asset = dict()
Expand Down Expand Up @@ -160,14 +161,16 @@ def _get_reduced_log_lines(
):
index = 0
for raw_line in buffered_raw_lines:
line_index = per_buffer_index + index

_append_reduced_log_line(
raw_line=raw_line,
reduced_log_lines=reduced_log_lines,
bucket=bucket,
request_type=request_type,
excluded_ips=excluded_ips,
log_file_path=raw_s3_log_file_path,
index=index,
line_index=line_index,
)
index += 1
per_buffer_index += index
Expand Down
14 changes: 8 additions & 6 deletions src/dandi_s3_log_parser/_s3_log_line_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"tls_version",
"access_point_arn",
]
_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "ip_address"]
_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "ip_address", "line_index"]

_FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING)
_ReducedLogLine = collections.namedtuple("ReducedLogLine", _REDUCED_PATTERN_TO_FIELD_MAPPING)
Expand Down Expand Up @@ -121,7 +121,7 @@ def _get_full_log_line(
*,
parsed_log_line: list[str],
log_file_path: pathlib.Path,
index: int,
line_index: int,
raw_line: str,
) -> _FullLogLine | None:
"""Construct a FullLogLine from a single parsed log line, or dump to error collection file and return None."""
Expand Down Expand Up @@ -151,7 +151,7 @@ def _get_full_log_line(
lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_lines_errors.txt"

with open(file=lines_errors_file_path, mode="a") as io:
io.write(f"Line {index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}\n\n")
io.write(f"Line {line_index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}\n\n")

return full_log_line

Expand All @@ -163,8 +163,8 @@ def _append_reduced_log_line(
bucket: str,
request_type: str,
excluded_ips: collections.defaultdict[str, bool],
line_index: int,
log_file_path: pathlib.Path,
index: int,
) -> None:
"""
Append the `reduced_log_lines` list with a ReducedLogLine constructed from a single raw log line, if it is valid.
Expand All @@ -182,7 +182,8 @@ def _append_reduced_log_line(
The type of request to filter for.
excluded_ips : collections.defaultdict of strings to booleans
A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
line_index : int
The index of the line in the raw log file.
"""
bucket = "" if bucket is None else bucket
excluded_ips = excluded_ips or collections.defaultdict(bool)
Expand All @@ -192,7 +193,7 @@ def _append_reduced_log_line(
full_log_line = _get_full_log_line(
parsed_log_line=parsed_log_line,
log_file_path=log_file_path,
index=index,
line_index=line_index,
raw_line=raw_line,
)

Expand Down Expand Up @@ -226,6 +227,7 @@ def _append_reduced_log_line(
timestamp=parsed_timestamp,
bytes_sent=parsed_bytes_sent,
ip_address=full_log_line.ip_address,
line_index=line_index,
)

reduced_log_lines.append(reduced_log_line)
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
timestamp bytes_sent ip_address
2022-05-04 05:06:35 512 192.0.2.0
timestamp bytes_sent ip_address line_index
2022-05-04 05:06:35 512 192.0.2.0 1
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
timestamp bytes_sent ip_address
2021-12-31 23:06:42 1443 192.0.2.0
timestamp bytes_sent ip_address line_index
2021-12-31 23:06:42 1443 192.0.2.0 0
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
timestamp bytes_sent ip_address
2022-03-16 02:21:12 512 192.0.2.0
2022-05-04 05:06:35 512 192.0.2.0
timestamp bytes_sent ip_address line_index
2022-03-16 02:21:12 512 192.0.2.0 1
2022-05-04 05:06:35 512 192.0.2.0 1
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
timestamp bytes_sent ip_address
2021-04-24 12:03:05 1443 192.0.2.0
2021-12-31 23:06:42 1443 192.0.2.0
timestamp bytes_sent ip_address line_index
2021-04-24 12:03:05 1443 192.0.2.0 0
2021-12-31 23:06:42 1443 192.0.2.0 0
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
timestamp bytes_sent ip_address
2023-06-26 03:05:53 6616308 192.0.2.0
timestamp bytes_sent ip_address line_index
2023-06-26 03:05:53 6616308 192.0.2.0 3
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
timestamp bytes_sent ip_address
2022-05-04 05:06:35 512 192.0.2.0
timestamp bytes_sent ip_address line_index
2022-05-04 05:06:35 512 192.0.2.0 1
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
timestamp bytes_sent ip_address
2021-12-31 23:06:42 1443 192.0.2.0
timestamp bytes_sent ip_address line_index
2021-12-31 23:06:42 1443 192.0.2.0 0

0 comments on commit 19ac9ac

Please sign in to comment.