From baa8055e9fc57ddc85a4d565a889805fd6190ae3 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 16:46:03 -0400 Subject: [PATCH 01/12] added another bad line; adjusted name of helper to clarify (#25) Co-authored-by: CodyCBakerPhD --- src/dandi_s3_log_parser/_s3_log_line_parser.py | 4 ++-- tests/examples/ordered_example_2/example_dandi_s3_log.log | 1 + .../blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index d938fda..2df7abb 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -72,7 +72,7 @@ def _find_all_possible_substring_indices(*, string: str, substring: str) -> list return indices -def _attempt_to_remove_bad_quotes(*, raw_line: str, bad_parsed_line: str) -> str: +def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: """ Attempt to remove bad quotes from a raw line of an S3 log file. @@ -112,7 +112,7 @@ def _parse_s3_log_line(*, raw_line: str) -> list[str]: if number_of_parsed_items <= 26: return parsed_log_line - potentially_cleaned_raw_line = _attempt_to_remove_bad_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) + potentially_cleaned_raw_line = _attempt_to_remove_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)] return parsed_log_line diff --git a/tests/examples/ordered_example_2/example_dandi_s3_log.log b/tests/examples/ordered_example_2/example_dandi_s3_log.log index 81d1749..9205ad3 100644 --- a/tests/examples/ordered_example_2/example_dandi_s3_log.log +++ b/tests/examples/ordered_example_2/example_dandi_s3_log.log @@ -1,3 +1,4 @@ 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv new file mode 100644 index 0000000..0f387f7 --- /dev/null +++ b/tests/examples/ordered_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2023-06-26 03:05:53 6616308 unknown From 0f598c60cb0c529fd6d22c7d66f267f5845722a2 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 17:21:54 -0400 Subject: [PATCH 02/12] break up long lines --- README.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d572382..d3b4b95 100644 --- a/README.md +++ b/README.md @@ -27,19 +27,32 @@ Developed for the [DANDI Archive](https://dandiarchive.org/). To iteratively parse all historical logs all at once (parallelization with 10-15 total GB recommended): ```bash -parse_all_dandi_raw_s3_logs --base_raw_s3_log_folder_path < base log folder > --parsed_s3_log_folder_path < output folder > --excluded_ips < comma-separated list of known IPs to exclude > --maximum_number_of_workers < number of CPUs to use > --maximum_buffer_size_in_bytes < approximate amount of RAM to use > +parse_all_dandi_raw_s3_logs \ + --base_raw_s3_log_folder_path < base log folder > \ + --parsed_s3_log_folder_path < output folder > \ + --excluded_ips < comma-separated list of known IPs to exclude > \ + --maximum_number_of_workers < number of CPUs to use > \ + --maximum_buffer_size_in_bytes < approximate amount of RAM to use > ``` For example, on Drogon: ```bash -parse_all_dandi_raw_s3_logs --base_raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs --parsed_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_7_13_2024/GET_per_asset_id --excluded_ips < Drogon's IP > --maximum_number_of_workers 30 --maximum_buffer_size_in_bytes 15000000000 +parse_all_dandi_raw_s3_logs \ + --base_raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs \ + --parsed_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_7_13_2024/GET_per_asset_id \ + --excluded_ips < Drogon's IP > \ + --maximum_number_of_workers 30 \ + --maximum_buffer_size_in_bytes 15000000000 ``` To parse only a single log file at a time, such as in a CRON job: ```bash -parse_dandi_raw_s3_log --raw_s3_log_file_path < s3 log file path > --parsed_s3_log_folder_path < output folder > --excluded_ips < comma-separated list of known IPs to exclude > +parse_dandi_raw_s3_log \ + --raw_s3_log_file_path < s3 log file path > \ + --parsed_s3_log_folder_path < output folder > \ + --excluded_ips < comma-separated list of known IPs to exclude > ``` From 11550c59107b840831539733200afe7233addfc7 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 19:48:02 -0400 Subject: [PATCH 03/12] reduce drogon CPU --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d3b4b95..8f10cb4 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ parse_all_dandi_raw_s3_logs \ --base_raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs \ --parsed_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_7_13_2024/GET_per_asset_id \ --excluded_ips < Drogon's IP > \ - --maximum_number_of_workers 30 \ + --maximum_number_of_workers 3 \ --maximum_buffer_size_in_bytes 15000000000 ``` From b5549e8876ba38980de84433dbd57b3206ce35eb Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:38:01 -0400 Subject: [PATCH 04/12] fix text coloration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f10cb4..1326622 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ For example, on Drogon: parse_all_dandi_raw_s3_logs \ --base_raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs \ --parsed_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_7_13_2024/GET_per_asset_id \ - --excluded_ips < Drogon's IP > \ + --excluded_ips < Drogons IP > \ --maximum_number_of_workers 3 \ --maximum_buffer_size_in_bytes 15000000000 ``` From 52fcdd9cd3cbf2f1de6277fe3b35624443dd9038 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:40:07 -0400 Subject: [PATCH 05/12] add PyPI release workflow --- .../pypi_publish_on_github_release.yml | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/pypi_publish_on_github_release.yml diff --git a/.github/workflows/pypi_publish_on_github_release.yml b/.github/workflows/pypi_publish_on_github_release.yml new file mode 100644 index 0000000..561381f --- /dev/null +++ b/.github/workflows/pypi_publish_on_github_release.yml @@ -0,0 +1,32 @@ +name: Upload Package to PyPI + +on: + release: + types: [published] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build + python -m pip install --upgrade twine + + - name: Build package + run: python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@v1.4.2 + with: + verbose: true + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} From 292fde0b4a0d2a8ed93f72f2084590fbe319df30 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:40:34 -0400 Subject: [PATCH 06/12] Rename pypi_publish_on_github_release.yml to publish_to_pypi_on_github_release.yml --- ...n_github_release.yml => publish_to_pypi_on_github_release.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{pypi_publish_on_github_release.yml => publish_to_pypi_on_github_release.yml} (100%) diff --git a/.github/workflows/pypi_publish_on_github_release.yml b/.github/workflows/publish_to_pypi_on_github_release.yml similarity index 100% rename from .github/workflows/pypi_publish_on_github_release.yml rename to .github/workflows/publish_to_pypi_on_github_release.yml From 05236f55def1e8ec8872635a237cf51af41c85aa Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:50:48 -0400 Subject: [PATCH 07/12] add daily trigger --- .github/workflows/testing_dev.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_dev.yml index cefd94a..c71cad3 100644 --- a/.github/workflows/testing_dev.yml +++ b/.github/workflows/testing_dev.yml @@ -1,5 +1,7 @@ name: Dev tests on: + schedule: + - cron: "0 6 * * *" # Daily at 2am EST workflow_call: secrets: IPINFO_HASH_SALT: From f5e0969ad3bc78d245d93b21c3256ff76d5b248f Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:52:26 -0400 Subject: [PATCH 08/12] deployment for dailies instead --- .github/workflows/deploy_daily_tests.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/workflows/deploy_daily_tests.yaml diff --git a/.github/workflows/deploy_daily_tests.yaml b/.github/workflows/deploy_daily_tests.yaml new file mode 100644 index 0000000..326aa29 --- /dev/null +++ b/.github/workflows/deploy_daily_tests.yaml @@ -0,0 +1,18 @@ +name: Deploy daily tests + +on: + schedule: + - cron: "0 6 * * *" # Daily at 2am EST + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + DailyTests: + uses: ./.github/workflows/testing_dev.yml + secrets: + IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }} + IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }} + CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }} From ce853e549117118ac771e4e9bca11b7430192dee Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:52:39 -0400 Subject: [PATCH 09/12] remove trigger --- .github/workflows/testing_dev.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_dev.yml index c71cad3..cefd94a 100644 --- a/.github/workflows/testing_dev.yml +++ b/.github/workflows/testing_dev.yml @@ -1,7 +1,5 @@ name: Dev tests on: - schedule: - - cron: "0 6 * * *" # Daily at 2am EST workflow_call: secrets: IPINFO_HASH_SALT: From 3563986ebae402576c7dee5cf9cfbe2c685a888a Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:52:58 -0400 Subject: [PATCH 10/12] Rename deploy_daily_tests.yaml to deploy_daily_tests.yml --- .../workflows/{deploy_daily_tests.yaml => deploy_daily_tests.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{deploy_daily_tests.yaml => deploy_daily_tests.yml} (100%) diff --git a/.github/workflows/deploy_daily_tests.yaml b/.github/workflows/deploy_daily_tests.yml similarity index 100% rename from .github/workflows/deploy_daily_tests.yaml rename to .github/workflows/deploy_daily_tests.yml From 73da2c5f4537afbb39e7e0d5b4c060168dc92714 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:53:45 -0400 Subject: [PATCH 11/12] add manual trigger too --- .github/workflows/deploy_daily_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/deploy_daily_tests.yml b/.github/workflows/deploy_daily_tests.yml index 326aa29..4c5239a 100644 --- a/.github/workflows/deploy_daily_tests.yml +++ b/.github/workflows/deploy_daily_tests.yml @@ -3,6 +3,7 @@ name: Deploy daily tests on: schedule: - cron: "0 6 * * *" # Daily at 2am EST + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} From acc51cc4a7432c244caf821d5e5eca53f5ecba6c Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:54:03 -0400 Subject: [PATCH 12/12] remove from main workflow --- .github/workflows/testing_dev.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_dev.yml index cefd94a..11f5e0f 100644 --- a/.github/workflows/testing_dev.yml +++ b/.github/workflows/testing_dev.yml @@ -8,7 +8,6 @@ on: required: true CODECOV_CREDENTIALS: required: true - workflow_dispatch: env: IPINFO_HASH_SALT: ${{ secrets.IPINFO_HASH_SALT }}