Skip to content

Commit

Permalink
readability
Browse files Browse the repository at this point in the history
  • Loading branch information
CodyCBakerPhD committed Aug 15, 2024
1 parent e7e4ad2 commit 5f242c4
Showing 1 changed file with 45 additions and 41 deletions.
86 changes: 45 additions & 41 deletions src/dandi_s3_log_parser/_s3_log_line_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,11 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
)

if full_log_line is None:
return
return None

# Various early skip conditions
if full_log_line.bucket != bucket:
return
return None

# Apply some minimal validation and contribute any invalidations to error collection
# These might slow parsing down a bit, but could be important to ensuring accuracy
Expand All @@ -101,30 +101,34 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
)
with open(file=lines_errors_file_path, mode="a") as io:
io.write(message)
return None

if _IS_OPERATION_TYPE_KNOWN[full_log_line.operation] is False:
message = (
f"Unexpected request type: '{full_log_line.operation}' on line {line_index} of file {log_file_path}.\n\n"
)
with open(file=lines_errors_file_path, mode="a") as io:
io.write(message)
return None

timezone = full_log_line.timestamp[-5:] != "+0000"
if timezone:
timezone = full_log_line.timestamp[-5:]
is_timezone_utc = timezone != "+0000"
if is_timezone_utc:
message = f"Unexpected time shift attached to log! Have always seen '+0000', found `{timezone=}`.\n\n"
with open(file=lines_errors_file_path, mode="a") as io:
io.write(message)
# Fine to continue here

# More early skip conditions
# More early skip conditions after validation
# Only accept 200-block status codes
if full_log_line.status_code[0] != "2":
return
return None

if full_log_line.operation != operation_type:
return
return None

if excluded_ips[full_log_line.ip_address] is True:
return
return None

# All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID
handled_asset_id = asset_id_handler(raw_asset_id=full_log_line.asset_id)
Expand All @@ -141,24 +145,25 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
reduced_and_binned_logs[handled_asset_id]["line_index"].append(line_index)


def _parse_s3_log_line(*, raw_line: str) -> list[str]:
    """
    Parse a single raw S3 log line into its list of field strings.

    Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing
    as a pre-step: no self-contained single regex was found that could account for these uncorrected strings,
    so lines that over-split on a first pass are scrubbed and parsed a second time.
    """

    def _extract_fields(line: str) -> list[str]:
        # Each regex match exposes three alternative capture groups; exactly one is non-empty per field.
        return [first or second or third for first, second, third in _S3_LOG_REGEX.findall(string=line)]

    fields = _extract_fields(raw_line)

    # A well-formed line yields at most 26 fields; more than that indicates stray quotes broke the split.
    if len(fields) <= 26:
        return fields

    # Attempt to scrub the offending quotes, then parse the cleaned line.
    scrubbed_line = _attempt_to_remove_quotes(raw_line=raw_line, bad_parsed_line=fields)
    return _extract_fields(scrubbed_line)


def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str:
Expand Down Expand Up @@ -186,25 +191,24 @@ def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str:
return cleaned_raw_line


def _parse_s3_log_line(*, raw_line: str) -> list[str]:
"""
The current method of parsing lines of an S3 log file.
Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing
as a pre-step. No self-contained single regex was found that could account for this uncorrected strings.
"""
parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=raw_line)]

number_of_parsed_items = len(parsed_log_line)

# Everything worked as expected
if number_of_parsed_items <= 26:
return parsed_log_line
def _find_all_possible_substring_indices(*, string: str, substring: str) -> list[int]:
indices = list()
start = 0
max_iter = 10**6
while True and start < max_iter:
next_index = string.find(substring, start)
if next_index == -1: # .find(...) was unable to locate the substring
break
indices.append(next_index)
start = next_index + 1

potentially_cleaned_raw_line = _attempt_to_remove_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line)
parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)]
if start >= max_iter:
message = (
f"Exceeded maximum iterations in `_find_all_possible_substring_indices` on `{string=}` with `{substring=}`."
)
raise StopIteration(message)

return parsed_log_line
return indices


def _get_full_log_line(
Expand Down

0 comments on commit 5f242c4

Please sign in to comment.