From 4049188d585c5acb849afff8e835926d4b44fd7a Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Wed, 14 Aug 2024 13:36:10 -0400
Subject: [PATCH] Add more types (#37)

* add more request types (REST.COPY.PART and REST.COPY.OBJECT_GET, truncated to "COPY")

* update test: sort both frames by timestamp before comparing; update expected region

---------

Co-authored-by: CodyCBakerPhD <codycbakerphd@gmail.com>
---
 src/dandi_s3_log_parser/_s3_log_line_parser.py     | 14 ++++++--------
 .../blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv |  2 +-
 tests/test_order_and_anonymize.py                  |  4 ++++
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py
index 19bc288..5590ff3 100644
--- a/src/dandi_s3_log_parser/_s3_log_line_parser.py
+++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py
@@ -3,15 +3,12 @@
 
 The strategy is to...
 
-1) Parse the raw line into a list of strings using a regex pattern.
+1) Parse the raw line into a list of strings using a combination of regex patterns and custom string manipulation.
 2) Construct a FullLogLine object from the parsed line. A collections.namedtuple object is used for performance.
-3) Reduce and map the information from the FullLogLine into a ReducedLogLine object.
-   This uses a lot less memory than the full version.
+3) Reduce and map the information from the FullLogLine into a collections.defaultdict object.
    Some of the mapping operations at this step include...
-      - Identifying the DANDI asset ID from the full blob.
-      - Parsing the timestamp in memory as a datetime.datetime object.
-      - Filtering out log lines from excluded IPs (such as Drogon or GitHub actions).
-      - Converting the full remote IP to a country and region, so it can be saved without violating privacy.
+      - Handling the timestamp in memory as a datetime.datetime object.
+      - Filtering out log lines from excluded IPs.
 """
 
 import collections
@@ -28,11 +25,12 @@
 # REST.PUT.OBJECT
 # REST.HEAD.OBJECT
 # REST.POST.OBJECT
+# REST.COPY.PART and REST.COPY.OBJECT_GET
 # REST.DELETE.OBJECT
 # REST.OPTIONS.PREFLIGHT
 # BATCH.DELETE.OBJECT
 # Longer names are truncated for lower data overhead via direct slicing based on known lengths and separator locations
-_KNOWN_REQUEST_TYPES = ["GET", "PUT", "HEAD", "POST", "DELE", "OPTI", ".DEL"]
+_KNOWN_REQUEST_TYPES = ["GET", "PUT", "HEAD", "POST", "COPY", "DELE", "OPTI", ".DEL"]
 
 _IS_REQUEST_TYPE_KNOWN = collections.defaultdict(bool)
 for request_type in _KNOWN_REQUEST_TYPES:
diff --git a/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv
index a10cb8a..2a908ec 100644
--- a/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv
+++ b/tests/examples/order_and_anonymize_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv
@@ -1,5 +1,5 @@
 	timestamp	bytes_sent	region
 0	2020-02-24 05:06:35	124	MX/Nuevo León
-1	2021-05-21 05:06:35	1234	EC/Guayas
+1	2021-05-21 05:06:35	1234	EC/Santa Elena
 2	2022-07-01 05:06:35	512	unknown
 3	2022-11-04 05:06:35	141424	FI/Uusimaa
diff --git a/tests/test_order_and_anonymize.py b/tests/test_order_and_anonymize.py
index 8dcb190..45bd0df 100644
--- a/tests/test_order_and_anonymize.py
+++ b/tests/test_order_and_anonymize.py
@@ -33,6 +33,10 @@ def test_order_and_anonymize(tmpdir: py.path.local) -> None:
             filepath_or_buffer=expected_ordered_and_anonymized_s3_log_file_path,
             index_col=0,
         )
+
+        test_ordered_and_anonymized_s3_log = test_ordered_and_anonymized_s3_log.sort_values(by="timestamp")
+        expected_ordered_and_anonymized_s3_log = expected_ordered_and_anonymized_s3_log.sort_values(by="timestamp")
+
         pandas.testing.assert_frame_equal(
             left=test_ordered_and_anonymized_s3_log,
             right=expected_ordered_and_anonymized_s3_log,