diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index 19bc288..5590ff3 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -3,15 +3,12 @@ The strategy is to... -1) Parse the raw line into a list of strings using a regex pattern. +1) Parse the raw line into a list of strings using a combination of regex patterns and custom string manipulation. 2) Construct a FullLogLine object from the parsed line. A collections.namedtuple object is used for performance. -3) Reduce and map the information from the FullLogLine into a ReducedLogLine object. - This uses a lot less memory than the full version. +3) Reduce and map the information from the FullLogLine into a collections.defaultdict object. Some of the mapping operations at this step include... - - Identifying the DANDI asset ID from the full blob. - - Parsing the timestamp in memory as a datetime.datetime object. - - Filtering out log lines from excluded IPs (such as Drogon or GitHub actions). - - Converting the full remote IP to a country and region, so it can be saved without violating privacy. + - Handling the timestamp in memory as a datetime.datetime object. + - Filtering out log lines from excluded IPs. """ import collections @@ -28,11 +25,12 @@ # REST.PUT.OBJECT # REST.HEAD.OBJECT # REST.POST.OBJECT +# REST.COPY.PART and REST.COPY.OBJECT_GET # REST.DELETE.OBJECT # REST.OPTIONS.PREFLIGHT # BATCH.DELETE.OBJECT # Longer names are truncated for lower data overhead via direct slicing based on known lengths and separator locations -_KNOWN_REQUEST_TYPES = ["GET", "PUT", "HEAD", "POST", "DELE", "OPTI", ".DEL"] +_KNOWN_REQUEST_TYPES = ["GET", "PUT", "HEAD", "POST", "COPY", "DELE", "OPTI", ".DEL"] _IS_REQUEST_TYPE_KNOWN = collections.defaultdict(bool) for request_type in _KNOWN_REQUEST_TYPES: diff --git a/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv index a10cb8a..2a908ec 100644 --- a/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv +++ b/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -1,5 +1,5 @@ timestamp bytes_sent region 0 2020-02-24 05:06:35 124 MX/Nuevo León -1 2021-05-21 05:06:35 1234 EC/Guayas +1 2021-05-21 05:06:35 1234 EC/Santa Elena 2 2022-07-01 05:06:35 512 unknown 3 2022-11-04 05:06:35 141424 FI/Uusimaa diff --git a/tests/test_order_and_anonymize.py b/tests/test_order_and_anonymize.py index 8dcb190..45bd0df 100644 --- a/tests/test_order_and_anonymize.py +++ b/tests/test_order_and_anonymize.py @@ -33,6 +33,10 @@ def test_order_and_anonymize(tmpdir: py.path.local) -> None: filepath_or_buffer=expected_ordered_and_anonymized_s3_log_file_path, index_col=0, ) + + test_ordered_and_anonymized_s3_log = test_ordered_and_anonymized_s3_log.sort_values(by="timestamp") + expected_ordered_and_anonymized_s3_log = expected_ordered_and_anonymized_s3_log.sort_values(by="timestamp") + pandas.testing.assert_frame_equal( left=test_ordered_and_anonymized_s3_log, right=expected_ordered_and_anonymized_s3_log,