From 23fdeb66bbf77005fe44086734fb4b55299c164e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Sun, 11 Aug 2024 16:25:55 -0400 Subject: [PATCH] added another bad line; adjusted name of helper to clarify --- src/dandi_s3_log_parser/_s3_log_line_parser.py | 4 ++-- tests/examples/ordered_example_2/example_dandi_s3_log.log | 1 + .../blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index d938fda..2df7abb 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -72,7 +72,7 @@ def _find_all_possible_substring_indices(*, string: str, substring: str) -> list return indices -def _attempt_to_remove_bad_quotes(*, raw_line: str, bad_parsed_line: str) -> str: +def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: """ Attempt to remove bad quotes from a raw line of an S3 log file. @@ -112,7 +112,7 @@ def _parse_s3_log_line(*, raw_line: str) -> list[str]: if number_of_parsed_items <= 26: return parsed_log_line - potentially_cleaned_raw_line = _attempt_to_remove_bad_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) + potentially_cleaned_raw_line = _attempt_to_remove_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)] return parsed_log_line diff --git a/tests/examples/ordered_example_2/example_dandi_s3_log.log b/tests/examples/ordered_example_2/example_dandi_s3_log.log index 81d1749..9205ad3 100644 --- a/tests/examples/ordered_example_2/example_dandi_s3_log.log +++ b/tests/examples/ordered_example_2/example_dandi_s3_log.log @@ -1,3 +1,4 @@ 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv new file mode 100644 index 0000000..0f387f7 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2023-06-26 03:05:53 6616308 unknown