From f183d0f8f92045ec95759707a686d54cf34de40f Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Sun, 11 Aug 2024 16:06:36 -0400 Subject: [PATCH] split tests; add first bad line; debug --- .../_s3_log_file_parser.py | 4 +- .../_s3_log_line_parser.py | 85 +++++++++--- .../example_dandi_s3_log.log | 3 + ...s_11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 + ...s_a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 + tests/test_dandi_s3_log_parser.py | 127 ------------------ tests/test_parse_all_dandi_raw_s3_logs.py | 46 +++++++ ...st_parse_all_dandi_raw_s3_logs_parallel.py | 47 +++++++ tests/test_parse_dandi_raw_s3_log.py | 49 +++++++ .../test_parse_dandi_raw_s3_log_bad_lines.py | 56 ++++++++ 10 files changed, 271 insertions(+), 150 deletions(-) create mode 100644 tests/examples/ordered_example_2/example_dandi_s3_log.log create mode 100644 tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv create mode 100644 tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_dandi_s3_log_parser.py create mode 100644 tests/test_parse_all_dandi_raw_s3_logs.py create mode 100644 tests/test_parse_all_dandi_raw_s3_logs_parallel.py create mode 100644 tests/test_parse_dandi_raw_s3_log.py create mode 100644 tests/test_parse_dandi_raw_s3_log_bad_lines.py diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py index 3c05cbf..ce39a2e 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py @@ -13,7 +13,7 @@ _load_ip_address_to_region_cache, _save_ip_address_to_region_cache, ) -from ._s3_log_line_parser import ReducedLogLine, _append_reduced_log_line +from ._s3_log_line_parser import _ReducedLogLine, _append_reduced_log_line from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._buffered_text_reader import BufferedTextReader from ._order_parsed_logs import order_parsed_logs @@ -155,7 +155,7 @@ def _get_reduced_log_lines( excluded_ips: collections.defaultdict[str, bool], tqdm_kwargs: dict | None = None, maximum_buffer_size_in_bytes: int = 4 * 10**9, -) -> list[ReducedLogLine]: +) -> list[_ReducedLogLine]: """ Reduce the full S3 log file to minimal content and return a list of in-memory collections.namedtuple objects. diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index 08b4edb..d938fda 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -23,7 +23,7 @@ from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._ip_utils import _get_region_from_ip_address -FULL_PATTERN_TO_FIELD_MAPPING = [ +_FULL_PATTERN_TO_FIELD_MAPPING = [ "bucket_owner", "bucket", "timestamp", @@ -50,25 +50,70 @@ "endpoint", "tls_version", "access_point_arn", - "extra", # TODO: Never figured out what this field is... 
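+    # NOTE: lines that still carry this extra trailing item parse to 26 elements and are
+    # truncated to the 25 known fields in `_get_full_log_line` below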
]
-REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "region"]
+_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "region"]
 
-FullLogLine = collections.namedtuple("FullLogLine", FULL_PATTERN_TO_FIELD_MAPPING)
-ReducedLogLine = collections.namedtuple("ReducedLogLine", REDUCED_PATTERN_TO_FIELD_MAPPING)
+_FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING)
+_ReducedLogLine = collections.namedtuple("ReducedLogLine", _REDUCED_PATTERN_TO_FIELD_MAPPING)
 
+_S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)')
 
-# Original
-# S3_LOG_REGEX = re.compile(r'(?:"([^"]+)")|(?:\[([^\]]+)\])|([^ ]+)')
+def _find_all_possible_substring_indices(*, string: str, substring: str) -> list[int]:
+    """Find the start indices of all (possibly overlapping) occurrences of a substring."""
+    indices = list()
+    start = 0
+    while True:
+        next_index = string.find(substring, start)
+        if next_index == -1:  # .find(...) was unable to locate the substring
+            break
+        indices.append(next_index)
+        start = next_index + 1
 
-# AI corrected...
-S3_LOG_REGEX = re.compile(r'"([^"]+)"|\[([^]]+)]|([^ ]+)')
+    return indices
+
+
+def _attempt_to_remove_bad_quotes(*, raw_line: str, bad_parsed_line: list[str]) -> str | list[str]:
+    """
+    Attempt to remove bad quotes from a raw line of an S3 log file.
+
+    These quotes are not properly escaped and cause issues with the regex pattern.
+    Various attempts to fix the regex failed, so this is the most reliable correction I could find.
+
+    Each quoted block is replaced with a plain "-" field; if the quote structure cannot be corrected,
+    the original bad parsed line is returned unchanged so the caller can route it to the error reporter.
+    """
+    starting_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring=' "')
+    ending_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring='" ')
+
+    # If the quote structure is even more irregular than anticipated,
+    # return the bad parsed line so that the error reporter can catch it
+    if len(starting_quotes_indices) == 0:
+        return bad_parsed_line
+    if len(starting_quotes_indices) != len(ending_quotes_indices):
+        return bad_parsed_line
+
+    cleaned_raw_line = raw_line[0 : starting_quotes_indices[0]]
+    # Replace every quoted block with "-"; the text after the final quoted block is appended below,
+    # so the range runs over all remaining starting quotes rather than stopping one short
+    for counter in range(1, len(starting_quotes_indices)):
+        next_block = raw_line[ending_quotes_indices[counter - 1] + 2 : starting_quotes_indices[counter]]
+        cleaned_raw_line += " - " + next_block
+    cleaned_raw_line += " - " + raw_line[ending_quotes_indices[-1] + 2 :]
+
+    return cleaned_raw_line
 
 
 def _parse_s3_log_line(*, raw_line: str) -> list[str]:
-    """The current method of parsing lines of an S3 log file."""
-    parsed_log_line = [a or b or c for a, b, c in S3_LOG_REGEX.findall(raw_line)]
+    """
+    The current method of parsing lines of an S3 log file.
+
+    Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing
+    as a pre-step. No self-contained single regex was found that could account for these uncorrected strings.
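+
+    A minimal illustration of the tokenization on a simplified (hypothetical) line:
+
+    >>> _parse_s3_log_line(raw_line='owner bucket [31/Dec/2021:23:06:42 +0000] 192.0.2.0 "GET /key HTTP/1.1" 200')
+    ['owner', 'bucket', '31/Dec/2021:23:06:42 +0000', '192.0.2.0', 'GET /key HTTP/1.1', '200']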
+    """
+    parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=raw_line)]
+
+    number_of_parsed_items = len(parsed_log_line)
+
+    # At or below the expected maximum of 26 items; the match statement in _get_full_log_line handles the variants
+    if number_of_parsed_items <= 26:
+        return parsed_log_line
+
+    potentially_cleaned_raw_line = _attempt_to_remove_bad_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line)
+    if not isinstance(potentially_cleaned_raw_line, str):
+        # Cleaning was not applicable; return the bad parsed line so the error reporter can catch it downstream
+        return potentially_cleaned_raw_line
+    parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)]
 
     return parsed_log_line
 
 
@@ -79,7 +124,7 @@ def _get_full_log_line(
     log_file_path: pathlib.Path,
     index: int,
     raw_line: str,
-) -> FullLogLine | None:
+) -> _FullLogLine | None:
     """Construct a FullLogLine from a single parsed log line, or dump to error collection file and return None."""
     full_log_line = None
 
@@ -88,15 +133,13 @@ def _get_full_log_line(
         # ARN not detected
         case 24:
             parsed_log_line.append("-")
-            parsed_log_line.append("-")
-            full_log_line = FullLogLine(*parsed_log_line)
-        # Expected form most of the time
+            full_log_line = _FullLogLine(*parsed_log_line)
+        # Expected length for good lines
        case 25:
-            parsed_log_line.append("-")
-            full_log_line = FullLogLine(*parsed_log_line)
-        # Happens for certain types of HEAD requests
+            full_log_line = _FullLogLine(*parsed_log_line)
+        # Happens for certain types of HEAD requests; not sure what the extra element is
        case 26:
-            full_log_line = FullLogLine(*parsed_log_line)
+            full_log_line = _FullLogLine(*parsed_log_line[:25])
 
     # Deviant log entry; usually some very ill-formed content in the URI
     # Dump information to a log file in the base folder for easy sharing
@@ -117,7 +160,7 @@ def _get_full_log_line(
 def _append_reduced_log_line(
     *,
     raw_line: str,
-    reduced_log_lines: list[ReducedLogLine],
+    reduced_log_lines: list[_ReducedLogLine],
     bucket: str,
     request_type: str,
     excluded_ips: collections.defaultdict[str, bool],
@@ -181,7 +224,7 @@ def _append_reduced_log_line(
     parsed_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S")
     parsed_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0
     region = _get_region_from_ip_address(ip_hash_to_region=ip_hash_to_region, ip_address=full_log_line.remote_ip)
-    reduced_log_line = ReducedLogLine(
+    reduced_log_line = _ReducedLogLine(
         asset_id=full_log_line.asset_id,
         timestamp=parsed_timestamp,
         bytes_sent=parsed_bytes_sent,
diff --git a/tests/examples/ordered_example_2/example_dandi_s3_log.log b/tests/examples/ordered_example_2/example_dandi_s3_log.log
new file mode 100644
index 0000000..81d1749
--- /dev/null
+++ b/tests/examples/ordered_example_2/example_dandi_s3_log.log
@@ -0,0 +1,3 @@
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - 
DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..76f3a91 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2022-05-04 05:06:35 512 unknown diff --git a/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..6980324 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2021-12-31 23:06:42 1443 unknown diff --git a/tests/test_dandi_s3_log_parser.py b/tests/test_dandi_s3_log_parser.py deleted file mode 100644 index 60a7ce3..0000000 --- a/tests/test_dandi_s3_log_parser.py +++ /dev/null @@ -1,127 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): - """ - Most basic test of functionality. - - If there are failures in the parsing of any lines found in application, - please raise an issue and contribute them to the example log collection. - """ - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_0" - example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_0" - dandi_s3_log_parser.parse_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" 
- - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -def test_parse_all_dandi_raw_s3_logs_example_0(tmpdir: py.path.local): - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_1" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" - dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( - base_raw_s3_log_folder_path=examples_folder_path, - parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local): - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "ordered_example_1" - expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" - - test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" - dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( - base_raw_s3_log_folder_path=examples_folder_path, - parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, - maximum_number_of_workers=2, - ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" 
- - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) - expected_parsed_s3_log_file_path = ( - expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -# TODO: add tests for API and CLI usage of finding random example line from testing submodule diff --git a/tests/test_parse_all_dandi_raw_s3_logs.py b/tests/test_parse_all_dandi_raw_s3_logs.py new file mode 100644 index 0000000..afdc76a --- /dev/null +++ b/tests/test_parse_all_dandi_raw_s3_logs.py @@ -0,0 +1,46 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_all_dandi_raw_s3_logs_example_0(tmpdir: py.path.local): + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_1" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" + dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( + base_raw_s3_log_folder_path=examples_folder_path, + parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
+ + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + +# TODO: add CLI diff --git a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py new file mode 100644 index 0000000..a975df9 --- /dev/null +++ b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py @@ -0,0 +1,47 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local): + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_1" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_1" + dandi_s3_log_parser.parse_all_dandi_raw_s3_logs( + base_raw_s3_log_folder_path=examples_folder_path, + parsed_s3_log_folder_path=test_parsed_s3_log_folder_path, + maximum_number_of_workers=2, + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + +# TODO: add CLI diff --git a/tests/test_parse_dandi_raw_s3_log.py b/tests/test_parse_dandi_raw_s3_log.py new file mode 100644 index 0000000..329b180 --- /dev/null +++ b/tests/test_parse_dandi_raw_s3_log.py @@ -0,0 +1,49 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): + """ + Most basic test of functionality. + + If there are failures in the parsing of any lines found in application, + please raise an issue and contribute them to the example log collection. 
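+
+    New cases can be contributed as an additional 'ordered_example_N' folder under 'tests/examples',
+    pairing the raw 'example_dandi_s3_log.log' with its 'expected_output' TSVs
+    (see 'ordered_example_2' added in this patch).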
+ """ + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_0" + example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_0" + dandi_s3_log_parser.parse_dandi_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) diff --git a/tests/test_parse_dandi_raw_s3_log_bad_lines.py b/tests/test_parse_dandi_raw_s3_log_bad_lines.py new file mode 100644 index 0000000..9ae2718 --- /dev/null +++ b/tests/test_parse_dandi_raw_s3_log_bad_lines.py @@ -0,0 +1,56 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_parse_dandi_raw_s3_log_example_2(tmpdir: py.path.local): + """ + 'ordered_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time. + """ + tmpdir = pathlib.Path(tmpdir) + + # Count initial error folder contents + error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" + error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + initial_number_of_error_folder_contents = len(error_folder_contents) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "ordered_example_2" + example_raw_s3_log_file_path = examples_folder_path / "example_dandi_s3_log.log" + expected_parsed_s3_log_folder_path = examples_folder_path / "expected_output" + + test_parsed_s3_log_folder_path = tmpdir / "parsed_example_2" + dandi_s3_log_parser.parse_dandi_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path + ) + test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" 
+ + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" + + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path, index_col=0) + expected_parsed_s3_log_file_path = ( + expected_parsed_s3_log_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + assert ( + len(post_test_error_folder_contents) == initial_number_of_error_folder_contents + ), "Errors occurred during line parsing!"
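
To spot-check how the new example lines tokenize without running the full test suite, a sketch along these lines works (assumes a checkout of this branch run from the repository root; `_parse_s3_log_line` is a private helper, so its import path may change):

```python
# Print the parsed item count for each line of the new 'ordered_example_2' log.
# Good lines yield 24-26 items (handled by the match statement in _get_full_log_line);
# anything else gets dumped to the error collection folder.
import pathlib

from dandi_s3_log_parser._s3_log_line_parser import _parse_s3_log_line

example_log_file_path = pathlib.Path("tests/examples/ordered_example_2/example_dandi_s3_log.log")
for line_index, raw_line in enumerate(example_log_file_path.read_text().splitlines()):
    parsed_log_line = _parse_s3_log_line(raw_line=raw_line)
    print(f"line {line_index}: {len(parsed_log_line)} parsed items")
```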