From 4caa118d912470648a28dd2bfdecaf8f6faf33d1 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 15:57:52 +0100 Subject: [PATCH 01/12] initial repo structure --- s3_upload/__init__.py | 0 s3_upload/s3_upload.py | 0 s3_upload/utils/__init__.py | 0 s3_upload/utils/utils.py | 0 tests/__init__.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 s3_upload/__init__.py create mode 100644 s3_upload/s3_upload.py create mode 100644 s3_upload/utils/__init__.py create mode 100644 s3_upload/utils/utils.py create mode 100644 tests/__init__.py diff --git a/s3_upload/__init__.py b/s3_upload/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/s3_upload/s3_upload.py b/s3_upload/s3_upload.py new file mode 100644 index 0000000..e69de29 diff --git a/s3_upload/utils/__init__.py b/s3_upload/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/s3_upload/utils/utils.py b/s3_upload/utils/utils.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 6cf28be4ecbeb464f8be782cacd9db94a51060cb Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 16:11:47 +0100 Subject: [PATCH 02/12] more boilerplate --- s3_upload/s3_upload.py | 26 ++++++++++++++++++++++++++ s3_upload/utils/utils.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/s3_upload/s3_upload.py b/s3_upload/s3_upload.py index e69de29..23c7c2f 100644 --- a/s3_upload/s3_upload.py +++ b/s3_upload/s3_upload.py @@ -0,0 +1,26 @@ +import argparse + + +def parse_args() -> argparse.Namespace: + """ + Parse cmd line arguments + + Returns + ------- + argparse.Namespace + parsed arguments + """ + parser = argparse.ArgumentParser() + + # TODO - add the args + # need to decide on what running modes to have and user config + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + +if __name__ == "__main__": + main() diff --git a/s3_upload/utils/utils.py b/s3_upload/utils/utils.py index e69de29..d3f4c6d 100644 --- a/s3_upload/utils/utils.py +++ b/s3_upload/utils/utils.py @@ -0,0 +1,32 @@ +def check_termination_files_exists(dir) -> bool: + """ + _summary_ + + Parameters + ---------- + dir : _type_ + _description_ + + Returns + ------- + bool + _description_ + """ + pass + + +def check_is_sequencing_dir(dir) -> bool: + """ + _summary_ + + Parameters + ---------- + dir : _type_ + _description_ + + Returns + ------- + bool + _description_ + """ + pass From 3e16bf977ad3073d96de059cd7ac488420ccaba0 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 16:22:58 +0100 Subject: [PATCH 03/12] more boilerplate --- s3_upload/utils/upload.py | 63 +++++++++++++++++++++++++++++++++++++++ s3_upload/utils/utils.py | 22 ++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 s3_upload/utils/upload.py diff --git a/s3_upload/utils/upload.py b/s3_upload/utils/upload.py new file mode 100644 index 0000000..122c243 --- /dev/null +++ b/s3_upload/utils/upload.py @@ -0,0 +1,63 @@ +"""Functions for handling uploading into S3""" + +from concurrent.futures import ( + ProcessPoolExecutor, + ThreadPoolExecutor, + wait, + as_completed, +) + +import boto3 + + +def upload_single_file(local_file): + """ + Uploads single file into S3 storage bucket + + Parameters + ---------- + local_file : _type_ + _description_ + """ + pass + + +def single_core_threaded_upload(files, threads) -> list: + """ + Uploads the given set of `files` to S3 on a single CPU core using + maximum of n threads + + Parameters + ---------- + files : _type_ + _description_ + threads : _type_ + _description_ + + Returns + ------- + list + _description_ + """ + pass + + +def call_by_core(files, cores, threads) -> list: + """ + Call the single_core_threaded_upload on `files` split across n + logical CPU cores + + Parameters + ---------- + files : _type_ + _description_ + cores : _type_ + _description_ + threads : _type_ + _description_ + + Returns + ------- + list + _description_ + """ diff --git a/s3_upload/utils/utils.py b/s3_upload/utils/utils.py index d3f4c6d..c51e371 100644 --- a/s3_upload/utils/utils.py +++ b/s3_upload/utils/utils.py @@ -1,3 +1,6 @@ +"""General utility functions""" + + def check_termination_files_exists(dir) -> bool: """ _summary_ @@ -30,3 +33,22 @@ def check_is_sequencing_dir(dir) -> bool: _description_ """ pass + + +def get_sequencing_file_list(dir, exclude_patterns) -> list: + """ + Recursively get list of files and their paths from the given directory + + Parameters + ---------- + dir : _type_ + _description_ + exclude_patterns : _type_ + _description_ + + Returns + ------- + list + _description_ + """ + pass From 7737a979ae070bac50135b626ac562d81c303eb0 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 16:23:04 +0100 Subject: [PATCH 04/12] add requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8da1b7f --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +boto3==1.35.27 From 97ca14dfeb954d754718cac3ca8887e6d4408585 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 16:47:59 +0100 Subject: [PATCH 05/12] more boilerplate --- s3_test.py | 74 +++++++++++++++++++++++++++++++++++++++++++ tests/__init__.py | 8 +++++ tests/test_data/.keep | 0 tests/test_upload.py | 0 tests/test_utils.py | 0 5 files changed, 82 insertions(+) create mode 100644 s3_test.py create mode 100644 tests/test_data/.keep create mode 100644 tests/test_upload.py create mode 100644 tests/test_utils.py diff --git a/s3_test.py b/s3_test.py new file mode 100644 index 0000000..fdb9736 --- /dev/null +++ b/s3_test.py @@ -0,0 +1,74 @@ +from concurrent.futures import ( + ProcessPoolExecutor, + ThreadPoolExecutor, + wait, + as_completed, +) +from glob import glob +import os +import pathlib +import sys + +import boto3 + + +def single_core_threaded_upload(files, threads): + """Upload files with single core but multiple threads""" + with ThreadPoolExecutor(max_workers=threads) as executor: + concurrent_jobs = { + executor.submit(upload, item): item for item in files + } + + for future in as_completed(concurrent_jobs): + # access returned output as each is returned in any order + try: + future.result() + except Exception as exc: + # catch any other errors that might get raised during querying + print( + f"\nError getting data for {concurrent_jobs[future]}: {exc}" + ) + raise exc + + +def multiple_core_threaded_upload(files, cores, threads): + """Split uploading of given files across n CPU cores""" + # MAX_CORES = 4 + + files = [files[i : i + cores] for i in range(0, len(files), cores)] + + with ProcessPoolExecutor(max_workers=cores) as exe: + futures = [ + exe.submit( + single_core_threaded_upload, threads=threads, files=sub_files + ) + for sub_files in files + ] + + wait(futures) + + +def upload(local_file): + """Upload single file to bucket""" + s3_client = boto3.client("s3") + + upload_file = local_file.lstrip(".").lstrip("/").replace("/genetics", "") + # print(f"Uploading {local_file} to {upload_file}") + + s3_client.upload_file(local_file, "jethro-s3-test-v2", upload_file) + + +if __name__ == "__main__": + files = [ + x + for x in glob(f"{sys.argv[1]}/**/*", recursive=True) + if pathlib.Path(x).is_file() + ] + + smol_files = [x for x in files if os.path.getsize(x) < 8388608] + big_files = [x for x in files if os.path.getsize(x) >= 8388608] + + # single_core_threaded_upload(files) + + multiple_core_threaded_upload(files=files, cores=4, threads=8) + # single_core_threaded_upload(big_files, threads=8) diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..c4ee2f4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,8 @@ +import sys +import os + +sys.path.append( + os.path.abspath(os.path.join(os.path.realpath(__file__), "../../")) +) + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "test_data") diff --git a/tests/test_data/.keep b/tests/test_data/.keep new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_upload.py b/tests/test_upload.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..e69de29 From 4ee150e31d42deb80ad0d406ad257505aa149bf4 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Mon, 30 Sep 2024 17:09:00 +0100 Subject: [PATCH 06/12] more boilerplate, function structure --- s3_upload/utils/upload.py | 28 ++++++++++++++++++---------- s3_upload/utils/utils.py | 17 +++++++++++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/s3_upload/utils/upload.py b/s3_upload/utils/upload.py index 122c243..0d22b8e 100644 --- a/s3_upload/utils/upload.py +++ b/s3_upload/utils/upload.py @@ -10,6 +10,13 @@ import boto3 +def authenticate(): + """ + Authenticate with AWS S3 with given credentials + """ + pass + + def upload_single_file(local_file): """ Uploads single file into S3 storage bucket @@ -29,10 +36,10 @@ def single_core_threaded_upload(files, threads) -> list: Parameters ---------- - files : _type_ - _description_ - threads : _type_ - _description_ + files : list + list of local files to upload + threads : int + maximum number of threaded process to open per core Returns ------- @@ -49,15 +56,16 @@ def call_by_core(files, cores, threads) -> list: Parameters ---------- - files : _type_ - _description_ - cores : _type_ - _description_ - threads : _type_ - _description_ + files : list + list of local files to upload + cores : int + maximum number of logical CPU cores to split uploading across + threads : int + maximum number of threaded process to open per core Returns ------- list _description_ """ + pass diff --git a/s3_upload/utils/utils.py b/s3_upload/utils/utils.py index c51e371..a0ab561 100644 --- a/s3_upload/utils/utils.py +++ b/s3_upload/utils/utils.py @@ -52,3 +52,20 @@ def get_sequencing_file_list(dir, exclude_patterns) -> list: _description_ """ pass + + +def check_upload_state(dir) -> str: + """ + Checking upload state of run (i.e. complete, partial, not started) + + Parameters + ---------- + dir : _type_ + _description_ + + Returns + ------- + str + _description_ + """ + pass From d328fe9b24db31a1bf758cd5db0e8c0d50308182 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:37:41 +0100 Subject: [PATCH 07/12] update comments --- s3_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/s3_test.py b/s3_test.py index fdb9736..0bb1c3d 100644 --- a/s3_test.py +++ b/s3_test.py @@ -33,8 +33,10 @@ def single_core_threaded_upload(files, threads): def multiple_core_threaded_upload(files, cores, threads): """Split uploading of given files across n CPU cores""" - # MAX_CORES = 4 + # split our list of files equally across cores + # TODO - think about splitting files by size between cores + # so we have a mix of large and small files split across cores files = [files[i : i + cores] for i in range(0, len(files), cores)] with ProcessPoolExecutor(max_workers=cores) as exe: @@ -53,7 +55,6 @@ def upload(local_file): s3_client = boto3.client("s3") upload_file = local_file.lstrip(".").lstrip("/").replace("/genetics", "") - # print(f"Uploading {local_file} to {upload_file}") s3_client.upload_file(local_file, "jethro-s3-test-v2", upload_file) From 885fe2cde5ae5254c63a75dac4434cc1113dd90e Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:38:03 +0100 Subject: [PATCH 08/12] remove test script --- s3_test.py | 75 ------------------------------------------------------ 1 file changed, 75 deletions(-) delete mode 100644 s3_test.py diff --git a/s3_test.py b/s3_test.py deleted file mode 100644 index 0bb1c3d..0000000 --- a/s3_test.py +++ /dev/null @@ -1,75 +0,0 @@ -from concurrent.futures import ( - ProcessPoolExecutor, - ThreadPoolExecutor, - wait, - as_completed, -) -from glob import glob -import os -import pathlib -import sys - -import boto3 - - -def single_core_threaded_upload(files, threads): - """Upload files with single core but multiple threads""" - with ThreadPoolExecutor(max_workers=threads) as executor: - concurrent_jobs = { - executor.submit(upload, item): item for item in files - } - - for future in as_completed(concurrent_jobs): - # access returned output as each is returned in any order - try: - future.result() - except Exception as exc: - # catch any other errors that might get raised during querying - print( - f"\nError getting data for {concurrent_jobs[future]}: {exc}" - ) - raise exc - - -def multiple_core_threaded_upload(files, cores, threads): - """Split uploading of given files across n CPU cores""" - - # split our list of files equally across cores - # TODO - think about splitting files by size between cores - # so we have a mix of large and small files split across cores - files = [files[i : i + cores] for i in range(0, len(files), cores)] - - with ProcessPoolExecutor(max_workers=cores) as exe: - futures = [ - exe.submit( - single_core_threaded_upload, threads=threads, files=sub_files - ) - for sub_files in files - ] - - wait(futures) - - -def upload(local_file): - """Upload single file to bucket""" - s3_client = boto3.client("s3") - - upload_file = local_file.lstrip(".").lstrip("/").replace("/genetics", "") - - s3_client.upload_file(local_file, "jethro-s3-test-v2", upload_file) - - -if __name__ == "__main__": - files = [ - x - for x in glob(f"{sys.argv[1]}/**/*", recursive=True) - if pathlib.Path(x).is_file() - ] - - smol_files = [x for x in files if os.path.getsize(x) < 8388608] - big_files = [x for x in files if os.path.getsize(x) >= 8388608] - - # single_core_threaded_upload(files) - - multiple_core_threaded_upload(files=files, cores=4, threads=8) - # single_core_threaded_upload(big_files, threads=8) From 8e61b54788638c00f28dec6527a8a4e1a9013444 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:41:18 +0100 Subject: [PATCH 09/12] add github actions workflow --- .github/workflows/pytest.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/pytest.yml diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..e452367 --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,21 @@ +name: pytest +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pipenv codecov + pip install -r requirements.txt + pipenv install --dev + - name: Test with pytest + run: | + pytest -v --cov --count 10 --random-order From 28b75b022f6cbb5e13e96e2a6b18e723a873acfa Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:41:33 +0100 Subject: [PATCH 10/12] update requirements.txt with pytest --- requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/requirements.txt b/requirements.txt index 8da1b7f..22b6316 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,9 @@ boto3==1.35.27 +pytest==7.0.1 +pytest-cov==4.0.0 +pytest-html==4.1.0 +pytest-metadata==3.0.0 +pytest-mock==3.11.1 +pytest-random-order==1.1.1 +pytest-repeat==0.9.3 +pytest-subtests==0.11.0 From c2eb79d46f66624a33f53780dfb2533f118de004 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:44:46 +0100 Subject: [PATCH 11/12] add .coveragerc --- .coveragerc | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..6d4ee03 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = tests/* + From 9b53ddc6852eeb3c73ae9fe247d86e8091d74188 Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Tue, 1 Oct 2024 14:45:00 +0100 Subject: [PATCH 12/12] update test command --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e452367..2cae752 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -18,4 +18,4 @@ jobs: pipenv install --dev - name: Test with pytest run: | - pytest -v --cov --count 10 --random-order + pytest -v --cov --count 10 --random-order .