From 77d9c8c03b3d43da956e714fdc761d8fe8468be9 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 4 Apr 2024 16:43:00 +0100
Subject: [PATCH 01/45] fix for #195 - correctly check all archived files

---
 .../dias_batch/tests/test_dx_requests.py      | 87 ++++++++++++++++---
 .../dnanexus/dias_batch/utils/dx_requests.py  | 45 +++++++---
 2 files changed, 111 insertions(+), 21 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index 5eeef5a..eb0fa4f 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -662,7 +662,7 @@ def test_trailing_blank_line_removed(self, mock_file):
         assert contents == ['line1', 'line2', 'line3']
 
 
-class TestDXManageCheckArchivalState():
+class TestDXManageCheckArchivalState(unittest.TestCase):
     """
     Tests for DXManage.check_archival_state()
 
@@ -713,19 +713,26 @@ class TestDXManageCheckArchivalState():
         }
     ]
 
-    def test_all_live(self, capsys):
+
+    @pytest.fixture(autouse=True)
+    def capsys(self, capsys):
+        """Capture stdout to provide it to tests"""
+        self.capsys = capsys
+
+
+    def test_all_live(self):
         """
         Test no error is raised when all provided files are live
         """
         DXManage().check_archival_state(
-            files=self.files,
+            sample_files=self.files,
             unarchive=False
         )
 
         # since we don't explicitly return anything when there are no
         # archived files, check stdout for expected string printed
         # to ensure the function passed through all checks to the end
-        stdout = capsys.readouterr().out
+        stdout = self.capsys.readouterr().out
 
         assert 'No required files in archived state' in stdout, (
             'Expected print for all live files not in captured stdout'
@@ -742,12 +749,12 @@ def test_error_raised_for_archived_files(self):
             match='Files required for analysis archived'
         ):
             DXManage().check_archival_state(
-            files=self.files_w_archive,
+            sample_files=self.files_w_archive,
             unarchive=False
         )
 
 
-    def test_archived_files_filtered_out_when_not_in_sample_list(self, capsys):
+    def test_archived_files_filtered_out_when_not_in_sample_list(self):
         """
         Test when a list of sample names is provided that any files for other
         samples are filtered out, we will test this by adding an archived file
@@ -755,14 +762,14 @@ def test_archived_files_filtered_out_when_not_in_sample_list(self, capsys):
         """
         # provide list of sample names to filter by
         DXManage().check_archival_state(
-            files=self.files_w_archive,
+            sample_files=self.files_w_archive,
             unarchive=False,
             samples=['sample1', 'sample2', 'sample3', 'sample4']
         )
 
         # since we don't explicitly return anything for all being live check
         # stdout for expected string printed to ensure we got where we expect
-        stdout = capsys.readouterr().out
+        stdout = self.capsys.readouterr().out
 
         assert 'No required files in archived state' in stdout, (
             'Expected print for all live files not in captured stdout'
@@ -781,12 +788,70 @@ def test_archived_files_kept_when_in_sample_list(self):
             # provide list of sample names to filter by, sample5 has
             # archived file and unarchive=False => should raise error
             DXManage().check_archival_state(
-                files=self.files_w_archive,
+                sample_files=self.files_w_archive,
                 unarchive=False,
                 samples=['sample5']
             )
 
 
+    def test_archived_non_sample_file_kept_when_sample_list_given(self):
+        """
+        Can pass a list of sample files plus non-sample file(s) (i.e.
+        intervals bed file from CNV calling), when the samples param is
+        also specified (i.e. list of sample names), ensure we don't wrongly
+        remove the non sample files
+        """
+        non_sample_archived_file = [
+                {
+                'id': 'file-zzz',
+                'describe': {
+                    'name': 'some_other_run_level_file.bed',
+                    'archivalState': 'archived'
+                }
+            }
+        ]
+
+        # test archived non sample file correctly raises error when
+        # provided with sample files and samples list
+        expected_error = "Files required for analysis archived"
+        with self.subTest():
+            with pytest.raises(RuntimeError, match=expected_error):
+                DXManage().check_archival_state(
+                    sample_files=self.files_w_archive,
+                    non_sample_files=non_sample_archived_file,
+                    unarchive=False,
+                    samples=['sample5']
+                )
+
+            # actually test the bed file is flagged
+            archived_bed_stdout = (
+                "some_other_run_level_file.bed (file-zzz) - archived"
+            )
+
+            assert  archived_bed_stdout in self.capsys.readouterr().out, (
+                'Archived bed not correctly identified as archived'
+            )
+
+        # test archived non sample file correctly raises error when
+        # NOT provided with other files
+        expected_error = "Files required for analysis archived"
+        with self.subTest():
+            with pytest.raises(RuntimeError, match=expected_error):
+                DXManage().check_archival_state(
+                    non_sample_files=non_sample_archived_file,
+                    unarchive=False
+                )
+
+            # actually test the bed file is flagged
+            archived_bed_stdout = (
+                "some_other_run_level_file.bed (file-zzz) - archived"
+            )
+
+            assert  archived_bed_stdout in self.capsys.readouterr().out, (
+                'Archived bed not correctly identified as archived'
+            )
+
+
     def test_error_raised_when_non_live_files_can_not_be_unarchived(self):
         """
         When non-live files are found their archivalState is checked and
@@ -821,7 +886,7 @@ def test_error_raised_when_non_live_files_can_not_be_unarchived(self):
             match='non-live files not in a state that can be unarchived'
         ):
             DXManage().check_archival_state(
-                files=files,
+                sample_files=files,
                 unarchive=True
             )
 
@@ -833,7 +898,7 @@ def test_unarchive_files_called_when_specified(self, mock_unarchive):
         we call the function to start unarchiving
         """
         DXManage().check_archival_state(
-            files=self.files_w_archive,
+            sample_files=self.files_w_archive,
             unarchive=True
         )
 
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 0bfd37b..79649d5 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -357,7 +357,13 @@ def read_dxfile(self, file) -> List[str]:
             project=project, dxid=file_id).read().rstrip('\n').split('\n')
 
 
-    def check_archival_state(self, files, unarchive, samples=None) -> None:
+    def check_archival_state(
+            self,
+            sample_files=[],
+            non_sample_files=[],
+            unarchive=False,
+            samples=None
+        ) -> None:
         """
         Check archival state of n files, to be used before attempting
         to launch jobs to ensure nothing fails due to archived files.
@@ -374,8 +380,12 @@ def check_archival_state(self, files, unarchive, samples=None) -> None:
 
         Parameters
         ---------
-        files : list
-            list of DXFile objects to check state of
+        sample_files : list
+            list of DXFile objects to check state of that belong to
+            individual samples (will be filtered by samples parameter)
+        non_sample_files : list
+            list of DXFile objects to check state of that will not be
+            filtered by the samples parameter
         unarchive : bool
             if to automatically unarchive files
         samples : list
@@ -389,12 +399,17 @@ def check_archival_state(self, files, unarchive, samples=None) -> None:
         RuntimeError
             Raised when required files are archived and -iunarchive=False
         """
-        print(f"\n \nChecking archival state of {len(files)} files...")
+        # non_sample_files = [] if not non_sample_files else non_sample_files
+        print(
+            f"\n \nChecking archival state of "
+            f"{len(sample_files) + len(non_sample_files)} files..."
+        )
 
-        # find files not in a live state, and filter these down by samples
-        # given that we're going to launch jobs for
+        # find sample files not in a live state, and filter these down
+        # by samples given that we're going to launch jobs for
         not_live = [
-            x for x in files if x['describe']['archivalState'] != 'live'
+            x for x in sample_files
+            if x['describe']['archivalState'] != 'live'
         ]
 
         if samples and not_live:
@@ -412,6 +427,12 @@ def check_archival_state(self, files, unarchive, samples=None) -> None:
 
             not_live = not_live_filtered
 
+        # add in any non sample files that are not in live state
+        not_live.extend([
+            x for x in non_sample_files
+            if x['describe']['archivalState'] != 'live'
+        ])
+
         if not not_live:
             # nothing archived that we need :dancing_penguin:
             print("No required files in archived state")
@@ -435,7 +456,7 @@ def check_archival_state(self, files, unarchive, samples=None) -> None:
         ])
 
         print(
-            f"\n \nWARNING: {len(not_live)} sample files to use for analysis "
+            f"\n \nWARNING: {len(not_live)} files to use for analysis "
             f"are not in a live state:\n\t{not_live_printable}\n \n"
         )
 
@@ -705,7 +726,10 @@ def cnv_calling(
             )
 
         # check to ensure all bams are unarchived
-        DXManage().check_archival_state(files, unarchive=unarchive)
+        DXManage().check_archival_state(
+            sample_files=files,
+            unarchive=unarchive
+        )
 
         files = [{"$dnanexus_link": file} for file in files]
         cnv_config['inputs']['bambais'] = files
@@ -1057,7 +1081,8 @@ def reports_workflow(
 
         # check to ensure all vcfs (and mosdepth files for SNVs) are unarchived
         DXManage().check_archival_state(
-            files=vcf_files + mosdepth_files + excluded_intervals_bed_file,
+            sample_files=vcf_files + mosdepth_files,
+            non_sample_files=excluded_intervals_bed_file,
             samples=manifest.keys(),
             unarchive=unarchive
         )

From 65a9d389f86904ae18fd11fe7015ae7c840e3d81 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Wed, 17 Apr 2024 13:05:15 +0100
Subject: [PATCH 02/45] fix #197 - treat empty tests list as no tests booked

---
 resources/home/dnanexus/dias_batch/tests/test_utils.py | 8 +++++++-
 resources/home/dnanexus/dias_batch/utils/utils.py      | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_utils.py b/resources/home/dnanexus/dias_batch/tests/test_utils.py
index 00bff6d..1fcc14d 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_utils.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_utils.py
@@ -1062,9 +1062,15 @@ def test_error_raised_when_sample_has_no_tests(self):
         """
         # drop test codes for a manifest sample
         manifest_copy = deepcopy(self.manifest)
+        manifest_copy['324338111-43206R00111']['tests'] = []
         manifest_copy['424487111-53214R00111']['tests'] = [[]]
 
-        with pytest.raises(RuntimeError, match=r"No tests booked for sample"):
+        expected_error = re.escape(
+            "'324338111-43206R00111': ['No tests booked for sample'], "
+            "'424487111-53214R00111': ['No tests booked for sample']"
+        )
+
+        with pytest.raises(RuntimeError, match=expected_error):
             utils.check_manifest_valid_test_codes(
                 manifest=manifest_copy, genepanels=self.genepanels
             )
diff --git a/resources/home/dnanexus/dias_batch/utils/utils.py b/resources/home/dnanexus/dias_batch/utils/utils.py
index 7dcbaae..093e1b6 100644
--- a/resources/home/dnanexus/dias_batch/utils/utils.py
+++ b/resources/home/dnanexus/dias_batch/utils/utils.py
@@ -732,7 +732,7 @@ def check_manifest_valid_test_codes(manifest, genepanels) -> dict:
     for sample, test_codes in manifest.items():
         sample_invalid_test = []
 
-        if test_codes['tests'] == [[]]:
+        if [x for x in test_codes['tests'] if x] == []:
             # sample has no booked tests => chuck it in the error bucket
             invalid[sample].append('No tests booked for sample')
             continue

From e2f920f55f46d80911c7758b856fe19ea745f874 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Wed, 24 Apr 2024 17:03:51 +0100
Subject: [PATCH 03/45] fix #198 - skip invalid manifest rows when subsetting

---
 .../dnanexus/dias_batch/tests/test_utils.py   | 49 +++++++++++++++++++
 .../home/dnanexus/dias_batch/utils/utils.py   | 12 +++++
 2 files changed, 61 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_utils.py b/resources/home/dnanexus/dias_batch/tests/test_utils.py
index 00bff6d..77c28b6 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_utils.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_utils.py
@@ -806,6 +806,55 @@ def test_epic_missing_sample_id_caught(self):
             utils.parse_manifest(data)
 
 
+    def test_epic_invalid_sample_id_skipped_when_subset_specified(self):
+        """
+        Where both ReanalysisID and SampleID are not valid, this would
+        normally raise a RuntimeError (as tested in test_epic_missing_
+        sample_id_caught()). If the subset param is specified these
+        should be skipped and only checked if any of the samples
+        specified to subset do not exist in the resultant manifest
+        """
+        data = deepcopy(self.epic_data)
+
+        # remove the specimen ID for the first sample to make it invalid
+        # row 2 => first row of sample data w/ normal specimen - instrument ID
+        data[2] = ';'.join([
+            '' if idx == 2 else x for idx, x in enumerate(data[2].split(';'))
+        ])
+
+        utils.parse_manifest(
+            data, subset='224289111-33202R00111,324338111-43206R00111'
+        )
+
+
+    def test_epic_invalid_sample_id_skipped_and_subset_checked(self):
+        """
+        As testing above in test_epic_invalid_sample_id_skipped_when_
+        subset_specified() - but now we want to test where the invalid
+        sample in the manifest that is skipped is specified in the
+        subset and we catch this and raise a RuntimeError
+        """
+        data = deepcopy(self.epic_data)
+
+        # remove the specimen ID for the first sample to make it invalid
+        # row 2 => first row of sample data w/ normal specimen - instrument ID
+        data[2] = ';'.join([
+            '' if idx == 2 else x for idx, x in enumerate(data[2].split(';'))
+        ])
+
+        expected_error = re.escape(
+            "Sample names provided to -isubset not in manifest: "
+            "['123245111-23146R00111']"
+        )
+
+        with pytest.raises(RuntimeError, match=expected_error):
+            # 123245111-23146R00111 provided to subset is missing the
+            # specimenID as removed above - make sure we catch this
+            utils.parse_manifest(
+                data, subset='224289111-33202R00111,123245111-23146R00111'
+            )
+
+
     def test_invalid_manifest(self):
         """
         Manifest file passed is checked if every row contains '\t' =>
diff --git a/resources/home/dnanexus/dias_batch/utils/utils.py b/resources/home/dnanexus/dias_batch/utils/utils.py
index 7dcbaae..59f188c 100644
--- a/resources/home/dnanexus/dias_batch/utils/utils.py
+++ b/resources/home/dnanexus/dias_batch/utils/utils.py
@@ -555,6 +555,18 @@ def parse_manifest(contents, split_tests=False, subset=None) -> Tuple[pd.DataFra
             elif re.match(r"[\d\w]+-[\d\w]+", row.SampleID):
                 data[row.SampleID]['tests'].append(test_codes)
                 manifest_source[row.SampleID] = {'manifest_source': 'Epic'}
+            elif subset:
+                # sampleID and reanalysisID don't seem valid, continue
+                # anyway if we're subsetting and assume that the user
+                # knows what they're doing, if the samples specified to
+                # --subset don't exist in the manifest this will still
+                # raise a RuntimeError below
+                print(
+                    f"Row {idx + 1} of manifest does not seem to contain all "
+                    "required identifiers, --subset specified so will skip "
+                    f"this row:\n\t{row.tolist()}"
+                )
+                continue
             else:
                 # something funky with this sample naming
                 raise RuntimeError(

From 609654d5d7174acb19ade8851aab3da205a5720f Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 12:39:02 +0100
Subject: [PATCH 04/45] fix #199 - add more detail to summary report on samples
 not processed

---
 .../home/dnanexus/dias_batch/dias_batch.py    |  4 ++
 .../dnanexus/dias_batch/tests/test_utils.py   | 50 +++++++++++++++++--
 .../home/dnanexus/dias_batch/utils/utils.py   | 17 ++++++-
 3 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index 28b8186..ebafa4c 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -341,6 +341,9 @@ def main(
         print("Parsed manifest(s):")
         print('⠀⠀', '\n⠀⠀⠀'.join({f"{k}: {v}" for k, v in manifest.items()}))
 
+        # record what we had provided before excluding anything
+        provided_manifest_samples = manifest.keys()
+
         # filter manifest tests against genepanels to ensure what has been
         # requested are test codes or HGNC IDs we recognise
         manifest = check_manifest_valid_test_codes(
@@ -514,6 +517,7 @@ def main(
         app=app_details,
         assay_config=assay_config,
         manifest=manifest,
+        provided_manifest_samples=provided_manifest_samples,
         launched_jobs=launched_jobs,
         excluded=exclude_samples,
         cnv_call_excluded=cnv_call_excluded_files,
diff --git a/resources/home/dnanexus/dias_batch/tests/test_utils.py b/resources/home/dnanexus/dias_batch/tests/test_utils.py
index 00bff6d..0cf0dce 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_utils.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_utils.py
@@ -119,7 +119,7 @@ def test_prev_samples_but_no_suffix_in_name(self):
         )
 
 
-class TestWriteSummaryReport():
+class TestWriteSummaryReport(unittest.TestCase):
     """
     Tests for utils.write_summary_report()
 
@@ -180,6 +180,10 @@ class TestWriteSummaryReport():
         'X111114': {'tests': [['R134.1']]}
     }
 
+    provided_manifest_samples = [
+        'X111111', 'X111112', 'X111113', 'X111114', 'X111115', 'X111116'
+    ]
+
     excluded_samples = ['X111115', 'X111116']
 
     # example list of files excluded from CNV calling
@@ -220,6 +224,7 @@ class TestWriteSummaryReport():
         app=app_details,
         assay_config=assay_config,
         manifest=manifest,
+        provided_manifest_samples=provided_manifest_samples,
         launched_jobs=launched_jobs,
         excluded=excluded_samples,
         cnv_call_excluded=cnv_call_excluded_files,
@@ -244,7 +249,8 @@ def test_inputs_written_correctly(self):
         # job inputs written between lines 'Job inputs:' and
         # 'Total number of samples in manifest: 4'
         start = self.summary_contents.index('Job inputs:')
-        end = self.summary_contents.index('Total number of samples in manifest: 4')
+        end = self.summary_contents.index(
+            'Total number of samples in provided manifest(s): 6')
 
         written_inputs = self.summary_contents[start + 1: end]
         written_inputs = sorted([
@@ -267,10 +273,10 @@ def test_total_no_samples_written(self):
         """
         samples = [
             x for x in self.summary_contents
-            if x.startswith('Total number of samples in manifest')
+            if x.startswith('Total number of samples in provided manifest(s):')
         ]
 
-        assert int(samples[0][-1]) == 4, (
+        assert int(samples[0][-1]) == 6, (
             'Total no. samples wrongly parsed from manifest'
         )
 
@@ -379,6 +385,42 @@ def test_report_summary(self):
         )
 
 
+    def test_manifest_sample_lines_correctly_written(self):
+        """
+        Test that the lines we write to the summary for the samples
+        originally in the provided manifest, the samples we ran jobs for
+        and the samples removed (i.e. where they had Research Use test
+        codes) are correctly written
+        """
+        expected_text = (
+            "\nTotal number of samples in provided manifest(s): 6"
+            "\nTotal number of samples processed from manifest(s): 4"
+            "\nSamples from manifest(s) not processed (2): X111115, X111116"
+        )
+
+        assert expected_text in '\n'.join(self.summary_contents)
+
+# '\nTotal number of samples in provided manifest(s): 6\nTotal number of samples processed from manifest(s): 4\nSamples from manifest(s) not processed (2): X111115, X111116'
+# '\nTotal number of samples in provided manifest(s): 6\nTotal number of samples processed from manifest(s): 4\nSamples from manifest(s) not processed (2): X111116, X111115\nSamples specified to exclude from CNV calling and CNV reports (2): X111115, X111116
+
+    #         file_handle.write(
+    #             "\nTotal number of samples in provided manifest(s): "
+    #             f"{len(summary.get('provided_manifest_samples'))}\n"
+    #         )
+    #         file_handle.write(
+    #             f"\nTotal number of samples processed from manifest: "
+    #             f"{len(manifest.keys())}"
+    #         )
+    #         file_handle.write(
+    #             f"Samples excluded from manifest and not processed "
+    #             f"({len(summary.get('provided_manifest_samples'))}): "
+    #             f"{', '.join(set(manifest.keys()) - set(summary.get('provided_manifest_samples')))}"
+    #         )
+
+    # provided_manifest_samples = [
+    #     'X111111', 'X111112', 'X111113', 'X111114', 'X111115', 'X111116'
+    # ]
+
 class TestMakePath():
     """
     Tests for utils.make_path()
diff --git a/resources/home/dnanexus/dias_batch/utils/utils.py b/resources/home/dnanexus/dias_batch/utils/utils.py
index 7dcbaae..9b5f777 100644
--- a/resources/home/dnanexus/dias_batch/utils/utils.py
+++ b/resources/home/dnanexus/dias_batch/utils/utils.py
@@ -134,7 +134,22 @@ def write_summary_report(output, job, app, manifest=None, **summary) -> None:
                 f"\nManifest(s) parsed: {job['runInput']['manifest_files']}\n"
             )
             file_handle.write(
-                f"\nTotal number of samples in manifest: {len(manifest.keys())}\n"
+                "\nTotal number of samples in provided manifest(s): "
+                f"{len(summary.get('provided_manifest_samples'))}"
+            )
+            file_handle.write(
+                f"\nTotal number of samples processed from manifest(s): "
+                f"{len(manifest.keys())}"
+            )
+
+            not_processed = sorted(
+                set(summary.get('provided_manifest_samples')) -
+                set(manifest.keys())
+            )
+            file_handle.write(
+                f"\nSamples from manifest(s) not processed "
+                f"({len(not_processed)}): "
+                f"{', '.join(not_processed) if not_processed else 'None'}"
             )
 
         if summary.get('excluded'):

From 1e342d1b31ee9b8bd7df036e2d6563c5554e1a02 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 12:41:00 +0100
Subject: [PATCH 05/45] bump app version

---
 dxapp.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dxapp.json b/dxapp.json
index 11b2c1d..043be05 100644
--- a/dxapp.json
+++ b/dxapp.json
@@ -1,7 +1,7 @@
 {
     "name": "eggd_dias_batch",
     "title": "eggd_dias_batch",
-    "version": "3.1.0",
+    "version": "3.2.0",
     "summary": "Launches downstream analyses for Dias",
     "dxapi": "1.0.0",
     "inputSpec": [

From 165054062cf493869f44431728980d006c654990 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 13:14:18 +0100
Subject: [PATCH 06/45] minor text formatting

---
 resources/home/dnanexus/dias_batch/utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/utils.py b/resources/home/dnanexus/dias_batch/utils/utils.py
index 9b5f777..4d471a1 100644
--- a/resources/home/dnanexus/dias_batch/utils/utils.py
+++ b/resources/home/dnanexus/dias_batch/utils/utils.py
@@ -149,7 +149,7 @@ def write_summary_report(output, job, app, manifest=None, **summary) -> None:
             file_handle.write(
                 f"\nSamples from manifest(s) not processed "
                 f"({len(not_processed)}): "
-                f"{', '.join(not_processed) if not_processed else 'None'}"
+                f"{', '.join(not_processed) if not_processed else 'None'}\n"
             )
 
         if summary.get('excluded'):

From 8ec29fe4759c057e11fd6a96bad8945f4e1981dc Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 14:06:17 +0100
Subject: [PATCH 07/45] remove commented code

---
 .../dnanexus/dias_batch/tests/test_utils.py   | 22 +------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_utils.py b/resources/home/dnanexus/dias_batch/tests/test_utils.py
index 0cf0dce..17da271 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_utils.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_utils.py
@@ -399,27 +399,7 @@ def test_manifest_sample_lines_correctly_written(self):
         )
 
         assert expected_text in '\n'.join(self.summary_contents)
-
-# '\nTotal number of samples in provided manifest(s): 6\nTotal number of samples processed from manifest(s): 4\nSamples from manifest(s) not processed (2): X111115, X111116'
-# '\nTotal number of samples in provided manifest(s): 6\nTotal number of samples processed from manifest(s): 4\nSamples from manifest(s) not processed (2): X111116, X111115\nSamples specified to exclude from CNV calling and CNV reports (2): X111115, X111116
-
-    #         file_handle.write(
-    #             "\nTotal number of samples in provided manifest(s): "
-    #             f"{len(summary.get('provided_manifest_samples'))}\n"
-    #         )
-    #         file_handle.write(
-    #             f"\nTotal number of samples processed from manifest: "
-    #             f"{len(manifest.keys())}"
-    #         )
-    #         file_handle.write(
-    #             f"Samples excluded from manifest and not processed "
-    #             f"({len(summary.get('provided_manifest_samples'))}): "
-    #             f"{', '.join(set(manifest.keys()) - set(summary.get('provided_manifest_samples')))}"
-    #         )
-
-    # provided_manifest_samples = [
-    #     'X111111', 'X111112', 'X111113', 'X111114', 'X111115', 'X111116'
-    # ]
+         f"({len(summary.get('provided_manifest_samples'))}): "
 
 class TestMakePath():
     """

From 72025201adfb5ac4192dfe76a020cbe45b7378fb Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 14:06:58 +0100
Subject: [PATCH 08/45] improve unit test error

---
 resources/home/dnanexus/dias_batch/tests/test_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_utils.py b/resources/home/dnanexus/dias_batch/tests/test_utils.py
index 17da271..9ae7cb9 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_utils.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_utils.py
@@ -398,8 +398,10 @@ def test_manifest_sample_lines_correctly_written(self):
             "\nSamples from manifest(s) not processed (2): X111115, X111116"
         )
 
-        assert expected_text in '\n'.join(self.summary_contents)
-         f"({len(summary.get('provided_manifest_samples'))}): "
+        assert expected_text in '\n'.join(self.summary_contents), (
+            'Manifest details incorrect in summary text'
+        )
+
 
 class TestMakePath():
     """

From 9c60a9e7a50e21c3135c1360efe8a69e00131c86 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 26 Apr 2024 14:53:55 +0100
Subject: [PATCH 09/45] fix pep8 issues

---
 .../dnanexus/dias_batch/tests/test_dx_requests.py    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index eb0fa4f..89cbfd1 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -749,9 +749,9 @@ def test_error_raised_for_archived_files(self):
             match='Files required for analysis archived'
         ):
             DXManage().check_archival_state(
-            sample_files=self.files_w_archive,
-            unarchive=False
-        )
+                sample_files=self.files_w_archive,
+                unarchive=False
+            )
 
 
     def test_archived_files_filtered_out_when_not_in_sample_list(self):
@@ -802,7 +802,7 @@ def test_archived_non_sample_file_kept_when_sample_list_given(self):
         remove the non sample files
         """
         non_sample_archived_file = [
-                {
+            {
                 'id': 'file-zzz',
                 'describe': {
                     'name': 'some_other_run_level_file.bed',
@@ -828,7 +828,7 @@ def test_archived_non_sample_file_kept_when_sample_list_given(self):
                 "some_other_run_level_file.bed (file-zzz) - archived"
             )
 
-            assert  archived_bed_stdout in self.capsys.readouterr().out, (
+            assert archived_bed_stdout in self.capsys.readouterr().out, (
                 'Archived bed not correctly identified as archived'
             )
 
@@ -847,7 +847,7 @@ def test_archived_non_sample_file_kept_when_sample_list_given(self):
                 "some_other_run_level_file.bed (file-zzz) - archived"
             )
 
-            assert  archived_bed_stdout in self.capsys.readouterr().out, (
+            assert archived_bed_stdout in self.capsys.readouterr().out, (
                 'Archived bed not correctly identified as archived'
             )
 

From aeaf033c3549c4ad1f5fba6884c636133bd80cf3 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 10:18:54 +0100
Subject: [PATCH 10/45] fix #207 - add new function
 CheckInputs.strip_string_inputs

---
 .../home/dnanexus/dias_batch/dias_batch.py    | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index ebafa4c..b9c3329 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -71,6 +71,7 @@ def __init__(self, **inputs) -> None:
         self.check_exclude_str_and_file()
         self.check_exclude_samples_file_id()
         self.check_qc_file()
+        self.strip_string_inputs()
 
         if self.errors:
             errors = '; '.join(x for x in self.errors)
@@ -251,6 +252,24 @@ def check_exclude_samples_file_id(self):
                     f"{self.inputs.get('exclude_samples')}"
                 )
 
+    def strip_string_inputs(self):
+        """
+        Strip string type inputs to ensure no leading or trailing
+        whitespace is retained
+        """
+        string_inputs = [
+            'assay',
+            'assay_config_dir',
+            'exclude_samples',
+            'manifest_subset',
+            'single_output_dir',
+            'cnv_call_job_id'
+        ]
+
+        for string in string_inputs:
+            if self.inputs.get(string) and isinstance(self.inputs.get(string), str):
+                self.inputs[string] = self.inputs[string].strip()
+
 
 @dxpy.entry_point('main')
 def main(

From 870d66a52514538d75c0ef88144955abf227c93c Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 10:19:22 +0100
Subject: [PATCH 11/45] add test for CheckInputs.strip_string_inputs

---
 .../dias_batch/tests/test_dias_batch.py       | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py b/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
index a85a464..1c292ad 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
@@ -258,6 +258,37 @@ def test_qc_status_file_is_valid(self, mock_file, mocker):
             "Error not raised when non .xlsx file provided to check_qc_file()"
         )
 
+    def test_string_inputs_with_whitespace_stripped(self, mocker):
+        """
+        Test that string inputs are correctly stripped
+        """
+        mocker.patch.object(CheckInputs, "__init__", return_value=None)
+        check = CheckInputs()
+
+        check.inputs = {
+            'assay': 'CEN ',
+            'assay_config_dir': ' some_dir',
+            'exclude_samples': ' sample1 ',
+            'manifest_subset': ' ',
+            'single_output_dir': '/output/foo/bar',
+            'cnv_call_job_id': 'job-xxx '
+        }
+
+        check.strip_string_inputs()
+
+        expected_inputs = {
+            'assay': 'CEN',
+            'assay_config_dir': 'some_dir',
+            'exclude_samples': 'sample1',
+            'manifest_subset': '',
+            'single_output_dir': '/output/foo/bar',
+            'cnv_call_job_id': 'job-xxx'
+        }
+
+        assert check.inputs == expected_inputs, (
+            'String inputs not correctly stripped'
+        )
+
 
 class TestMain():
     """

From c090e779a6c1ee057568845e1c97d20f37faa4c6 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 17:19:00 +0100
Subject: [PATCH 12/45] add fix for #196

---
 .../home/dnanexus/dias_batch/dias_batch.py    |  50 ++++++++
 .../dnanexus/dias_batch/utils/defaults.py     |  32 +++++
 .../dnanexus/dias_batch/utils/dx_requests.py  | 116 +++++++++++++++++-
 3 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 resources/home/dnanexus/dias_batch/utils/defaults.py

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index b9c3329..7d7130d 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -382,6 +382,56 @@ def main(
             for sample in manifest
         }
 
+    # check up front if any files for any of the selected running modes
+    # are in an archived state which would cause jobs to fail to launch
+    # mode_file_patterns = assay_config.get('mode_file_patterns')
+
+
+        # mode_file_patterns = {
+        #     'cnv_reports': {
+        #         'sample': [
+        #             '_segments.vcf$'
+        #         ],
+        #         'run': [
+        #             '_excluded_intervals.bed$'
+        #         ]
+        #     },
+        #     'snv_reports': {
+        #         'sample': [
+        #             '_markdup_recalibrated_Haplotyper.vcf.gz$',
+        #             'per-base.bed.gz$',
+        #             'reference_build.txt$'
+        #         ],
+        #         'run': []
+        #     },
+        #     'mosaic_reports': {
+        #         'sample': [
+        #             '_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+        #             'per-base.bed.gz$',
+        #             'reference_build.txt$'
+        #         ],
+        #         'run': []
+        #     },
+        #     'artemis': {
+        #         'bam$',
+        #         'bam.bai$',
+        #         '_copy_ratios.gcnv.bed$',
+        #         '_copy_ratios.gcnv.bed.tbi$'
+        #     }
+        # }
+
+    DXManage().check_all_files_archival_state(
+        patterns=assay_config.get('mode_file_patterns'),
+        samples=manifest.keys(),
+        unarchive=unarchive,
+        modes={
+            'cnv_reports': cnv_reports,
+            'snv_reports': snv_reports,
+            'mosaic_reports': mosaic_reports,
+            'artemis': artemis
+        }
+    )
+
     launched_jobs = {}
     cnv_report_errors = snv_report_errors = mosaic_report_errors = \
         cnv_call_excluded_files = cnv_report_summary = snv_report_summary = \
diff --git a/resources/home/dnanexus/dias_batch/utils/defaults.py b/resources/home/dnanexus/dias_batch/utils/defaults.py
new file mode 100644
index 0000000..04e72bd
--- /dev/null
+++ b/resources/home/dnanexus/dias_batch/utils/defaults.py
@@ -0,0 +1,32 @@
+default_mode_file_patterns = {
+    'cnv_reports': {
+        'sample': [
+            '_segments.vcf$'
+        ],
+        'run': [
+            '_excluded_intervals.bed$'
+        ]
+    },
+    'snv_reports': {
+        'sample': [
+            '_markdup_recalibrated_Haplotyper.vcf.gz$',
+            'per-base.bed.gz$',
+            'reference_build.txt$'
+        ],
+        'run': []
+    },
+    'mosaic_reports': {
+        'sample': [
+            '_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+            'per-base.bed.gz$',
+            'reference_build.txt$'
+        ],
+        'run': []
+    },
+    'artemis': {
+        'bam$',
+        'bam.bai$',
+        '_copy_ratios.gcnv.bed$',
+        '_copy_ratios.gcnv.bed.tbi$'
+    }
+}
\ No newline at end of file
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 79649d5..77aea39 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -17,6 +17,8 @@
 from packaging.version import Version
 import pandas as pd
 
+from .defaults import default_mode_file_patterns
+
 from .utils import (
     add_dynamic_inputs,
     check_exclude_samples,
@@ -227,7 +229,7 @@ def get_file_project_context(self, file) -> dxpy.DXObject:
 
 
     def find_files(
-        self, path, subdir='', limit=None, pattern=None) -> List[dxpy.DXObject]:
+        self, path='/', subdir='', limit=None, pattern=None) -> List[dxpy.DXObject]:
         """
         Search given path in DNAnexus, optionally filter down by a sub
         directory and / or with a file name regex pattern. Default
@@ -357,6 +359,98 @@ def read_dxfile(self, file) -> List[str]:
             project=project, dxid=file_id).read().rstrip('\n').split('\n')
 
 
+    def check_all_files_archival_state(
+        self,
+        patterns,
+        samples,
+        modes,
+        unarchive
+        ) -> list:
+        """
+        Checks for all specified file patterns and samples for each
+        running mode to ensure they are unarchived before attempting
+        to launch any jobs
+
+        Parameters
+        ----------
+        patterns : dict
+            mapping of running mode to file patterns to check for
+        samples : list
+            list of samples to filter returned files by
+        modes: dict
+            mapping of running modes to booleans if they are being run
+        unarchive : bool
+            if to automatically unarchive files, will be passed through
+            to self.check_archival_state
+
+        Returns
+        -------
+        None
+            no value returned, files are passed to self.check_archival_state
+        """
+        print("\nChecking archival states for all selected running modes")
+
+        sample_files_to_check = []
+        run_files_to_check = []
+
+        if not patterns:
+            # file patterns to check per running mode not defined in config,
+            # use current patterns correct as of 12/07/2024 as default
+            # TODO - remove this once it is added to both CEN and TWE configs
+            print(
+                "No mode file patterns defined in assay config, using "
+                "default values from utils.defaults"
+            )
+            patterns = default_mode_file_patterns
+
+        for mode, _ in modes.items():
+            if not mode:
+                continue
+
+            mode_sample_patterns = patterns.get(mode, {}).get('sample')
+            mode_run_patterns = patterns.get(mode, {}).get('run')
+
+            if mode_sample_patterns:
+                # generate regex pattern per sample for each file pattern,
+                # then join it as one big chongus pattern for a single query
+                # because its not our API server load to worry about
+                sample_patterns = '|'.join([
+                    [f"{x}.*{y}" for x in samples for y in mode_sample_patterns]
+                ])
+                print(
+                    f"Searching per sample files for {mode} with "
+                    f"{len(mode_sample_patterns)} patterns for {len(samples)} "
+                    "samples"
+                )
+                print(sample_patterns)
+
+                sample_files_to_check.extend(self.find_files(
+                    pattern=sample_patterns
+                ))
+
+            if mode_run_patterns:
+                print(
+                    f"Searching per run files for {mode} with "
+                    f"{len(mode_run_patterns)} patterns"
+                )
+                run_files_to_check.extend(self.find_files(
+                    pattern='|'.join(mode_run_patterns)
+                ))
+
+        print(
+            f"Found {len(sample_files_to_check)} sample files and "
+            f"{len(run_files_to_check)} run level files to check"
+        )
+
+        self.check_archival_state(
+            sample_files=sample_files_to_check,
+            non_sample_files=run_files_to_check,
+            unarchive=unarchive
+        )
+
+        exit()
+
+
     def check_archival_state(
             self,
             sample_files=[],
@@ -614,6 +708,26 @@ def format_output_folders(self, workflow, single_output, time_stamp, name) -> di
         return stage_folders
 
 
+    def remove_job_tags(self) -> None:
+        """
+        Checks for presence of job tags relating to unarchiving on
+        launching, any of these present suggests the job has been cloned
+        and therefore not relating to the current job
+        """
+        job = dxpy.DXJob(dxid=os.environ.get('DX_JOB_ID'))
+        current_tags = job.describe(fields={'tags': True}).get('tags')
+
+        unarchive_tags = [
+            tag for tag in current_tags if re.match(
+                r'Unarchiving of [\d]+ requested, no jobs launched', tag
+            )
+        ]
+
+        if unarchive_tags:
+            print("Removing old archive tag(s)  from job")
+            job.remove_tags(unarchive_tags)
+
+
 class DXExecute():
     """
     Methods for handling execution of apps / workflows

From d308400700978404bb255a90d3b1bf39c94dbc7c Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 20:55:47 +0100
Subject: [PATCH 13/45] formatting

---
 .../home/dnanexus/dias_batch/dias_batch.py    | 36 -------------------
 .../dnanexus/dias_batch/utils/dx_requests.py  | 31 ++++++++++------
 2 files changed, 21 insertions(+), 46 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index 7d7130d..5ef7db0 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -384,42 +384,6 @@ def main(
 
     # check up front if any files for any of the selected running modes
     # are in an archived state which would cause jobs to fail to launch
-    # mode_file_patterns = assay_config.get('mode_file_patterns')
-
-
-        # mode_file_patterns = {
-        #     'cnv_reports': {
-        #         'sample': [
-        #             '_segments.vcf$'
-        #         ],
-        #         'run': [
-        #             '_excluded_intervals.bed$'
-        #         ]
-        #     },
-        #     'snv_reports': {
-        #         'sample': [
-        #             '_markdup_recalibrated_Haplotyper.vcf.gz$',
-        #             'per-base.bed.gz$',
-        #             'reference_build.txt$'
-        #         ],
-        #         'run': []
-        #     },
-        #     'mosaic_reports': {
-        #         'sample': [
-        #             '_markdup_recalibrated_tnhaplotyper2.vcf.gz',
-        #             'per-base.bed.gz$',
-        #             'reference_build.txt$'
-        #         ],
-        #         'run': []
-        #     },
-        #     'artemis': {
-        #         'bam$',
-        #         'bam.bai$',
-        #         '_copy_ratios.gcnv.bed$',
-        #         '_copy_ratios.gcnv.bed.tbi$'
-        #     }
-        # }
-
     DXManage().check_all_files_archival_state(
         patterns=assay_config.get('mode_file_patterns'),
         samples=manifest.keys(),
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 77aea39..de3eeff 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -390,9 +390,6 @@ def check_all_files_archival_state(
         """
         print("\nChecking archival states for all selected running modes")
 
-        sample_files_to_check = []
-        run_files_to_check = []
-
         if not patterns:
             # file patterns to check per running mode not defined in config,
             # use current patterns correct as of 12/07/2024 as default
@@ -401,21 +398,34 @@ def check_all_files_archival_state(
                 "No mode file patterns defined in assay config, using "
                 "default values from utils.defaults"
             )
-            patterns = default_mode_file_patterns
+            patterns = dict(default_mode_file_patterns)
+
+        for x, y in patterns.items():
+            print(x, y)
+
+        print('all samples: ', samples)
+
+        sample_files_to_check = []
+        run_files_to_check = []
 
         for mode, _ in modes.items():
+            print(mode)
             if not mode:
                 continue
 
+
             mode_sample_patterns = patterns.get(mode, {}).get('sample')
             mode_run_patterns = patterns.get(mode, {}).get('run')
 
+            print(mode_sample_patterns)
+            print(mode_run_patterns)
+
             if mode_sample_patterns:
                 # generate regex pattern per sample for each file pattern,
                 # then join it as one big chongus pattern for a single query
                 # because its not our API server load to worry about
                 sample_patterns = '|'.join([
-                    [f"{x}.*{y}" for x in samples for y in mode_sample_patterns]
+                    f"{x}.*{y}" for x in samples for y in mode_sample_patterns
                 ])
                 print(
                     f"Searching per sample files for {mode} with "
@@ -442,11 +452,12 @@ def check_all_files_archival_state(
             f"{len(run_files_to_check)} run level files to check"
         )
 
-        self.check_archival_state(
-            sample_files=sample_files_to_check,
-            non_sample_files=run_files_to_check,
-            unarchive=unarchive
-        )
+        if sample_files_to_check or run_files_to_check:
+            self.check_archival_state(
+                sample_files=sample_files_to_check,
+                non_sample_files=run_files_to_check,
+                unarchive=unarchive
+            )
 
         exit()
 

From 45ae7cc7462c9a26e603ee84c0959fc0b266c05d Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 21:15:22 +0100
Subject: [PATCH 14/45] fix default pattern structure

---
 resources/home/dnanexus/dias_batch/utils/defaults.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/defaults.py b/resources/home/dnanexus/dias_batch/utils/defaults.py
index 04e72bd..b278ea8 100644
--- a/resources/home/dnanexus/dias_batch/utils/defaults.py
+++ b/resources/home/dnanexus/dias_batch/utils/defaults.py
@@ -24,9 +24,11 @@
         'run': []
     },
     'artemis': {
-        'bam$',
-        'bam.bai$',
-        '_copy_ratios.gcnv.bed$',
-        '_copy_ratios.gcnv.bed.tbi$'
+        'sample':[
+            'bam$',
+            'bam.bai$',
+            '_copy_ratios.gcnv.bed$',
+            '_copy_ratios.gcnv.bed.tbi$'
+        ]
     }
 }
\ No newline at end of file

From dc9f9f3285ae931943c9a330294951dd05c50682 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 12 Jul 2024 21:15:53 +0100
Subject: [PATCH 15/45] ensure path for DXManage.find_files is non empty

---
 resources/home/dnanexus/dias_batch/utils/dx_requests.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index de3eeff..f96f82b 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -251,7 +251,9 @@ def find_files(
         list
             list of files found
         """
-        path = path.rstrip('/')
+        if path != '/':
+            path = path.rstrip('/')
+
         if subdir:
             subdir = subdir.strip('/')
 

From 43bb4e5b949675c05888c167c7432b335cd5c4c3 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 09:58:45 +0100
Subject: [PATCH 16/45] fix passing project context to unarchive check

---
 resources/home/dnanexus/dias_batch/dias_batch.py       |  1 +
 .../home/dnanexus/dias_batch/utils/dx_requests.py      | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index 5ef7db0..d8f76c5 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -387,6 +387,7 @@ def main(
     DXManage().check_all_files_archival_state(
         patterns=assay_config.get('mode_file_patterns'),
         samples=manifest.keys(),
+        path=single_output_dir,
         unarchive=unarchive,
         modes={
             'cnv_reports': cnv_reports,
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index f96f82b..009bd31 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -229,7 +229,7 @@ def get_file_project_context(self, file) -> dxpy.DXObject:
 
 
     def find_files(
-        self, path='/', subdir='', limit=None, pattern=None) -> List[dxpy.DXObject]:
+        self, path, subdir='', limit=None, pattern=None) -> List[dxpy.DXObject]:
         """
         Search given path in DNAnexus, optionally filter down by a sub
         directory and / or with a file name regex pattern. Default
@@ -251,8 +251,7 @@ def find_files(
         list
             list of files found
         """
-        if path != '/':
-            path = path.rstrip('/')
+        path = path.rstrip('/')
 
         if subdir:
             subdir = subdir.strip('/')
@@ -365,6 +364,7 @@ def check_all_files_archival_state(
         self,
         patterns,
         samples,
+        path,
         modes,
         unarchive
         ) -> list:
@@ -379,6 +379,8 @@ def check_all_files_archival_state(
             mapping of running mode to file patterns to check for
         samples : list
             list of samples to filter returned files by
+        path : str
+            path to search for files
         modes: dict
             mapping of running modes to booleans if they are being run
         unarchive : bool
@@ -437,6 +439,7 @@ def check_all_files_archival_state(
                 print(sample_patterns)
 
                 sample_files_to_check.extend(self.find_files(
+                    path=path,
                     pattern=sample_patterns
                 ))
 
@@ -446,6 +449,7 @@ def check_all_files_archival_state(
                     f"{len(mode_run_patterns)} patterns"
                 )
                 run_files_to_check.extend(self.find_files(
+                    path=path,
                     pattern='|'.join(mode_run_patterns)
                 ))
 

From f591fec02f1c8c52f5f9451b767223d24fac06bf Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 09:59:40 +0100
Subject: [PATCH 17/45] improve logging

---
 .../dnanexus/dias_batch/utils/dx_requests.py   | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index f96f82b..39b7c51 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -402,26 +402,19 @@ def check_all_files_archival_state(
             )
             patterns = dict(default_mode_file_patterns)
 
-        for x, y in patterns.items():
-            print(x, y)
-
-        print('all samples: ', samples)
+        print("Currently defined patterns:")
+        prettier_print(patterns)
 
         sample_files_to_check = []
         run_files_to_check = []
 
-        for mode, _ in modes.items():
-            print(mode)
-            if not mode:
+        for mode, selected in modes.items():
+            if not selected:
                 continue
 
-
             mode_sample_patterns = patterns.get(mode, {}).get('sample')
             mode_run_patterns = patterns.get(mode, {}).get('run')
 
-            print(mode_sample_patterns)
-            print(mode_run_patterns)
-
             if mode_sample_patterns:
                 # generate regex pattern per sample for each file pattern,
                 # then join it as one big chongus pattern for a single query
@@ -434,7 +427,6 @@ def check_all_files_archival_state(
                     f"{len(mode_sample_patterns)} patterns for {len(samples)} "
                     "samples"
                 )
-                print(sample_patterns)
 
                 sample_files_to_check.extend(self.find_files(
                     pattern=sample_patterns
@@ -451,7 +443,7 @@ def check_all_files_archival_state(
 
         print(
             f"Found {len(sample_files_to_check)} sample files and "
-            f"{len(run_files_to_check)} run level files to check"
+            f"{len(run_files_to_check)} run level files to check status of"
         )
 
         if sample_files_to_check or run_files_to_check:

From da07162df78ccca5a4a034875da15227a79ef407 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 10:27:59 +0100
Subject: [PATCH 18/45] remove early exit

---
 resources/home/dnanexus/dias_batch/utils/dx_requests.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 70e8202..de608a5 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -457,8 +457,6 @@ def check_all_files_archival_state(
                 unarchive=unarchive
             )
 
-        exit()
-
 
     def check_archival_state(
             self,

From a4699c4d79e222c3ce2e7e3789ed644eb65bd1c3 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 11:24:48 +0100
Subject: [PATCH 19/45] remove unused function

---
 .../dnanexus/dias_batch/utils/dx_requests.py  | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index de608a5..3a00eae 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -715,26 +715,6 @@ def format_output_folders(self, workflow, single_output, time_stamp, name) -> di
         return stage_folders
 
 
-    def remove_job_tags(self) -> None:
-        """
-        Checks for presence of job tags relating to unarchiving on
-        launching, any of these present suggests the job has been cloned
-        and therefore not relating to the current job
-        """
-        job = dxpy.DXJob(dxid=os.environ.get('DX_JOB_ID'))
-        current_tags = job.describe(fields={'tags': True}).get('tags')
-
-        unarchive_tags = [
-            tag for tag in current_tags if re.match(
-                r'Unarchiving of [\d]+ requested, no jobs launched', tag
-            )
-        ]
-
-        if unarchive_tags:
-            print("Removing old archive tag(s)  from job")
-            job.remove_tags(unarchive_tags)
-
-
 class DXExecute():
     """
     Methods for handling execution of apps / workflows

From 3c1e23b3a4d592eea0b766993a26cba4437f7e92 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:14:53 +0100
Subject: [PATCH 20/45] add unit tests for
 dx_requests.check_all_files_archival_state

---
 .../dias_batch/tests/test_dx_requests.py      | 271 ++++++++++++++++++
 1 file changed, 271 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index 89cbfd1..2d68f71 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -662,6 +662,277 @@ def test_trailing_blank_line_removed(self, mock_file):
         assert contents == ['line1', 'line2', 'line3']
 
 
+@patch('utils.dx_requests.DXManage.find_files')
+@patch('utils.dx_requests.DXManage.check_archival_state')
+class TestCheckAllFilesArchivalState(unittest.TestCase):
+    """
+    Tests for dx_requests.check_all_files_archival_state
+
+    Function takes patterns per running mode of required file types to
+    check archival state of, and queries the given path for all files
+    and then checks the archival state.
+    """
+    @pytest.fixture(autouse=True)
+    def capsys(self, capsys):
+        """Capture stdout to provide it to tests"""
+        self.capsys = capsys
+
+
+    def test_default_patterns_used_if_no_default_provided(
+        self, mock_archive, mock_find
+    ):
+        """
+        Test if no pattern is provided from the config that the default
+        patterns from utils.defaults is used.
+
+        We will use calls to dx_requests.find_files as a proxy for the
+        config patterns being used, as we expected 5 calls for the 4
+        running modes to be made (4x per sample and 1x per run patterns).
+        """
+        DXManage().check_all_files_archival_state(
+            patterns=None,
+            samples=['sample_1', 'sample_2'],
+            path='project-xxx:/',
+            unarchive=False,
+            modes={
+                'cnv_reports': True,
+                'snv_reports': True,
+                'mosaic_reports': True,
+                'artemis': True
+            }
+        )
+
+        assert mock_find.call_count == 5, (
+            'incorrect number of calls to dx_requests.find_files'
+        )
+
+
+    def test_running_mode_check_skipped_if_not_selected(
+        self, mock_archive, mock_find
+    ):
+        """
+        Test if when a running mode has not been selected (i.e. not
+        running CNV reports) that the checks for these file types are
+        not run.
+        """
+        DXManage().check_all_files_archival_state(
+            patterns=None,
+            samples=['sample_1', 'sample_2'],
+            path='project-xxx:/',
+            unarchive=False,
+            modes={
+                'cnv_reports': False,
+                'snv_reports': True,
+                'mosaic_reports': True,
+                'artemis': True
+            }
+        )
+
+        with self.subTest('expected message not in stdout'):
+            expected_stdout = (
+                "Running mode cnv_reports not selected, skipping file check"
+            )
+
+            assert expected_stdout in self.capsys.readouterr().out
+
+        with self.subTest('incorrect calls made to dx_requests.find_files'):
+            assert mock_find.call_count == 3
+
+
+    def test_correct_patterns_provided_for_each_mode(
+        self, mock_archive, mock_find
+    ):
+        """
+        Test that for each running mode the correct patterns are provided
+        when doing the searching
+        """
+        DXManage().check_all_files_archival_state(
+            patterns=None,
+            samples=['sample_1', 'sample_2'],
+            path='project-xxx:/',
+            unarchive=False,
+            modes={
+                'cnv_reports': True,
+                'snv_reports': True,
+                'mosaic_reports': True,
+                'artemis': True
+            }
+        )
+
+        # define what patterns we expect the function to provide to
+        # each call to dx_requests.find_files for each running mode
+        expected_called_patterns = {
+            'cnv_reports_sample': (
+                    'sample_1.*_segments.vcf$|sample_2.*_segments.vcf$'
+            ),
+            'cnv_reports_run': '_excluded_intervals.bed$',
+            'snv_reports': (
+                'sample_1.*_markdup_recalibrated_Haplotyper.vcf.gz$|'
+                'sample_1.*per-base.bed.gz$|sample_1.*reference_build.txt$|'
+                'sample_2.*_markdup_recalibrated_Haplotyper.vcf.gz$|'
+                'sample_2.*per-base.bed.gz$|sample_2.*reference_build.txt$'
+                ),
+            'mosaic_reports': (
+                'sample_1.*_markdup_recalibrated_tnhaplotyper2.vcf.gz|'
+                'sample_1.*per-base.bed.gz$|sample_1.*reference_build.txt$|'
+                'sample_2.*_markdup_recalibrated_tnhaplotyper2.vcf.gz|'
+                'sample_2.*per-base.bed.gz$|sample_2.*reference_build.txt$'
+            ),
+            'artemis': (
+                'sample_1.*bam$|sample_1.*bam.bai$|'
+                'sample_1.*_copy_ratios.gcnv.bed$|'
+                'sample_1.*_copy_ratios.gcnv.bed.tbi$|'
+                'sample_2.*bam$|sample_2.*bam.bai$|'
+                'sample_2.*_copy_ratios.gcnv.bed$|'
+                'sample_2.*_copy_ratios.gcnv.bed.tbi$'
+            )
+        }
+
+        called_patterns = [x[1]['pattern'] for x in mock_find.call_args_list]
+
+        assert sorted(expected_called_patterns.values()) == sorted(called_patterns), (
+            'incorrect patterns provided to dx_requests.find_files'
+        )
+
+
+    def test_call_to_check_archival_state_correct(
+        self, mock_archive, mock_find
+    ):
+        """
+        Test that when we've found files for samples for each running mode,
+        that these are all correctly passed to
+        dx_requests.check_archival_state to actually check if they're archived
+        """
+        # define what files each call to dx_requests.find_files should
+        # return, will be called twice for CNV reports (per sample then
+        # per run), then once for each other mode
+        mock_find.side_effect = [
+            ['sample1_segments.vcf', 'sample2_segments.vcf'],
+            ['myRun_excluded_intervals.bed'],
+            [
+                'sample1_markdup_recalibrated_Haplotyper.vcf.gz',
+                'sample2_markdup_recalibrated_Haplotyper.vcf.gz',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt'
+            ],
+            [
+                'sample1_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+                'sample2_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt'
+            ],
+            [
+                'sample1_bam$',
+                'sample1_bam.bai$',
+                'sample1_copy_ratios.gcnv.bed$',
+                'sample1_copy_ratios.gcnv.bed.tbi$',
+                'sample2_bam$',
+                'sample2_bam.bai$',
+                'sample2_copy_ratios.gcnv.bed$',
+                'sample2_copy_ratios.gcnv.bed.tbi$'
+            ]
+        ]
+
+        DXManage().check_all_files_archival_state(
+            patterns=None,
+            samples=['sample_1', 'sample_2'],
+            path='project-xxx:/',
+            unarchive=False,
+            modes={
+                'cnv_reports': True,
+                'snv_reports': True,
+                'mosaic_reports': True,
+                'artemis': True
+            }
+        )
+
+        # we expect to pass a single level list of all above files to
+        # dx_requests.check_archival_state
+        expected_sample_files = [
+                'sample1_segments.vcf', 'sample2_segments.vcf',
+                'sample1_markdup_recalibrated_Haplotyper.vcf.gz',
+                'sample2_markdup_recalibrated_Haplotyper.vcf.gz',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+                'sample2_markdup_recalibrated_tnhaplotyper2.vcf.gz',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_per-base.bed.gz',
+                'sample2_reference_build.txt',
+                'sample1_bam$',
+                'sample1_bam.bai$',
+                'sample1_copy_ratios.gcnv.bed$',
+                'sample1_copy_ratios.gcnv.bed.tbi$',
+                'sample2_bam$',
+                'sample2_bam.bai$',
+                'sample2_copy_ratios.gcnv.bed$',
+                'sample2_copy_ratios.gcnv.bed.tbi$'
+        ]
+
+        expected_run_files = ['myRun_excluded_intervals.bed']
+
+        with self.subTest('wrong sample files passed to check archival state'):
+            assert sorted(mock_archive.call_args[1]['sample_files']) == \
+                sorted(expected_sample_files)
+
+        with self.subTest('wrong run files passed to check archival state'):
+            assert mock_archive.call_args[1]['non_sample_files'] == \
+                expected_run_files
+
+
+    def test_unarchive_passed_to_check_archival_state(
+        self, mock_archive, mock_find
+    ):
+        """
+        Test that the unarchive param passed to check_all_files_archival_
+        state is passed through to dx_requests.check_archival_state
+        """
+        # set some return value so check_archival_state will get called
+        mock_find.return_value = ['foo']
+
+        with self.subTest('check unarchive False'):
+            DXManage().check_all_files_archival_state(
+                patterns=None,
+                samples=['sample_1', 'sample_2'],
+                path='project-xxx:/',
+                unarchive=False,
+                modes={
+                    'cnv_reports': True,
+                    'snv_reports': True,
+                    'mosaic_reports': True,
+                    'artemis': True
+                }
+            )
+
+            # should be passed through as False
+            assert mock_archive.call_args[1]['unarchive'] == False
+
+        with self.subTest('check unarchive True'):
+            DXManage().check_all_files_archival_state(
+                patterns=None,
+                samples=['sample_1', 'sample_2'],
+                path='project-xxx:/',
+                unarchive=True,
+                modes={
+                    'cnv_reports': True,
+                    'snv_reports': True,
+                    'mosaic_reports': True,
+                    'artemis': True
+                }
+            )
+
+            # should be passed through as True
+            assert mock_archive.call_args[1]['unarchive'] == True
+
+
+
 class TestDXManageCheckArchivalState(unittest.TestCase):
     """
     Tests for DXManage.check_archival_state()

From 7af927f5965ba0bd674afaa231950324711a62d5 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:14:58 +0100
Subject: [PATCH 21/45] fix return type

---
 .../home/dnanexus/dias_batch/utils/dx_requests.py     | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 3a00eae..13b2f59 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -367,7 +367,7 @@ def check_all_files_archival_state(
         path,
         modes,
         unarchive
-        ) -> list:
+        ):
         """
         Checks for all specified file patterns and samples for each
         running mode to ensure they are unarchived before attempting
@@ -386,13 +386,9 @@ def check_all_files_archival_state(
         unarchive : bool
             if to automatically unarchive files, will be passed through
             to self.check_archival_state
-
-        Returns
-        -------
-        list
-            _description_
         """
-        print("\nChecking archival states for all selected running modes")
+        print("\nChecking archival states for selected running modes:")
+        prettier_print(modes)
 
         if not patterns:
             # file patterns to check per running mode not defined in config,
@@ -412,6 +408,7 @@ def check_all_files_archival_state(
 
         for mode, selected in modes.items():
             if not selected:
+                print(f'Running mode {mode} not selected, skipping file check')
                 continue
 
             mode_sample_patterns = patterns.get(mode, {}).get('sample')

From 07a8a444470cfd033f9eab1fc5ca0f4fece9765d Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:24:58 +0100
Subject: [PATCH 22/45] add details to readme on archivalState checking

---
 readme.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/readme.md b/readme.md
index 29e7ed3..752db9a 100644
--- a/readme.md
+++ b/readme.md
@@ -59,6 +59,8 @@ DNAnexus app for launching CNV calling, one or more of SNV, CNV and mosaic repor
 
 The app takes as a minimum input a path to Dias single output, an assay config, and at least one of the above listed running modes. The default behaviour is to pass an assay string specified to run for (with `-iassay`), which will search DNAnexus for the highest version config file in `-iassay_config_dir`  (default: `001_Reference:/dynamic_files/dias_batch_configs/`)  and use this for analysis. Alternatively, an assay config file may be specified to use instead with `-iassay_config_file`. If running a reports workflow a manifest file must also be specified.
 
+Before any jobs are launched, a check of the archival state of all required files is first made. This will use the file pattern mappings either defined in `utils.defaults` or from the assay config file (if specified) to search for the per sample and per run files required, and will raise an error on any archived files if `unarchive=True` is not set.
+
 The general behaviour of each mode is as follows:
 
 ### CNV calling
@@ -221,6 +223,7 @@ The top level section should be structured as follows:
 - `{cnv_call_app|_report_workflow}_id` (`str`) : the IDs of CNV calling and reports workflows to use
 - `reference_files` (`dict`) : mapping of reference file name : DNAnexus file ID, reference file name _must_ be given as shown above, and DNAnexus file ID should be provided as `project-xxx:file-xxx`
 - `name_patterns` (`dict`) : mapping of the manifest source and a regex pattern to use for filtering sample names and files etc.
+- `mode_file_patterns` (`dict` | optional): mapping for each running mode to sample and run file patterns for which to search and check the archival state of before launching any jobs. Defaults are defined in `utils.defaults`, and a mapping of the same structure may be added to the assay config file to override the defaults.
 
 The definitions of inputs for CNV calling and each reports workflow should be defined under the key `modes`, containing a mapping of all inputs and other inputs for controlling running of analyses.
 

From 4283533021230913fbfe8e769a99fa7e95ee0433 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:27:15 +0100
Subject: [PATCH 23/45] add .coveragerc

---
 .coveragerc | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .coveragerc

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..c712d25
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = tests/*

From 477f9308d67a5933c5dbda071c1575509909f89d Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:43:25 +0100
Subject: [PATCH 24/45] update coveragerc path

---
 .coveragerc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.coveragerc b/.coveragerc
index c712d25..eb1e682 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,2 @@
 [run]
-omit = tests/*
+omit = resources/home/dnanexus/dias_batch/tests/*

From 00ff13ac65feb63bb4e2980a6d8f9f03d190a10a Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Mon, 15 Jul 2024 14:58:05 +0100
Subject: [PATCH 25/45] fix #206 - add more detail to readme on dynamic inputs

---
 readme.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/readme.md b/readme.md
index 752db9a..ff76284 100644
--- a/readme.md
+++ b/readme.md
@@ -314,6 +314,8 @@ The definitions of inputs for CNV calling and each reports workflow should be de
     - `INPUT-test_codes` : `'&&'` separated string of test codes
     - `INPUT-sample_name` : string of sample name from manifest
 
+    These are added to the config via [`utils.add_dynamic_inputs`](https://github.com/eastgenomics/eggd_dias_batch/blob/b63a04e2d421a246017e984efcc2a9eef85fbeaf/resources/home/dnanexus/dias_batch/utils/utils.py#L1073) from kwargs generated at run time specified [here](https://github.com/eastgenomics/eggd_dias_batch/blob/b63a04e2d421a246017e984efcc2a9eef85fbeaf/resources/home/dnanexus/dias_batch/utils/dx_requests.py#L1170).
+
 ---
 
 ## What does this app output

From 365fc0114db3e390036cc85526c6a54d61cf9ee1 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 15:15:26 +0100
Subject: [PATCH 26/45] add multiqc html pattern to defaults.py

---
 resources/home/dnanexus/dias_batch/utils/defaults.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/utils/defaults.py b/resources/home/dnanexus/dias_batch/utils/defaults.py
index b278ea8..28aa46d 100644
--- a/resources/home/dnanexus/dias_batch/utils/defaults.py
+++ b/resources/home/dnanexus/dias_batch/utils/defaults.py
@@ -29,6 +29,9 @@
             'bam.bai$',
             '_copy_ratios.gcnv.bed$',
             '_copy_ratios.gcnv.bed.tbi$'
+        ],
+        'run': [
+            '-multiqc.html'
         ]
     }
 }
\ No newline at end of file

From 33a36a1af515499e979a9ead24c3222f3b622d48 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 15:26:24 +0100
Subject: [PATCH 27/45] bump packaging version to fix test setup error

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 479788c..119b724 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 dxpy==0.318.1
-packaging==20.3
+packaging==24.1
 pandas==1.4.1
 pytest==7.0.1
 pytest-cov==4.0.0

From 5409391cb0002e17a9c0f84caa0d6292fa186712 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 15:41:33 +0100
Subject: [PATCH 28/45] fix failing tests for new multiqc.html pattern

---
 .../dias_batch/tests/test_dx_requests.py      | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index 2d68f71..821274b 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -686,8 +686,8 @@ def test_default_patterns_used_if_no_default_provided(
         patterns from utils.defaults is used.
 
         We will use calls to dx_requests.find_files as a proxy for the
-        config patterns being used, as we expected 5 calls for the 4
-        running modes to be made (4x per sample and 1x per run patterns).
+        config patterns being used, as we expected 6 calls for the 4
+        running modes to be made (4x per sample and 2x per run patterns).
         """
         DXManage().check_all_files_archival_state(
             patterns=None,
@@ -702,7 +702,7 @@ def test_default_patterns_used_if_no_default_provided(
             }
         )
 
-        assert mock_find.call_count == 5, (
+        assert mock_find.call_count == 6, (
             'incorrect number of calls to dx_requests.find_files'
         )
 
@@ -736,7 +736,7 @@ def test_running_mode_check_skipped_if_not_selected(
             assert expected_stdout in self.capsys.readouterr().out
 
         with self.subTest('incorrect calls made to dx_requests.find_files'):
-            assert mock_find.call_count == 3
+            assert mock_find.call_count == 4
 
 
     def test_correct_patterns_provided_for_each_mode(
@@ -778,14 +778,15 @@ def test_correct_patterns_provided_for_each_mode(
                 'sample_2.*_markdup_recalibrated_tnhaplotyper2.vcf.gz|'
                 'sample_2.*per-base.bed.gz$|sample_2.*reference_build.txt$'
             ),
-            'artemis': (
+            'artemis_sample': (
                 'sample_1.*bam$|sample_1.*bam.bai$|'
                 'sample_1.*_copy_ratios.gcnv.bed$|'
                 'sample_1.*_copy_ratios.gcnv.bed.tbi$|'
                 'sample_2.*bam$|sample_2.*bam.bai$|'
                 'sample_2.*_copy_ratios.gcnv.bed$|'
                 'sample_2.*_copy_ratios.gcnv.bed.tbi$'
-            )
+            ),
+            'artemis_run': '-multiqc.html'
         }
 
         called_patterns = [x[1]['pattern'] for x in mock_find.call_args_list]
@@ -834,7 +835,8 @@ def test_call_to_check_archival_state_correct(
                 'sample2_bam.bai$',
                 'sample2_copy_ratios.gcnv.bed$',
                 'sample2_copy_ratios.gcnv.bed.tbi$'
-            ]
+            ],
+            ['002_myRun-multiqc.html']
         ]
 
         DXManage().check_all_files_archival_state(
@@ -876,14 +878,17 @@ def test_call_to_check_archival_state_correct(
                 'sample2_copy_ratios.gcnv.bed.tbi$'
         ]
 
-        expected_run_files = ['myRun_excluded_intervals.bed']
+        expected_run_files = [
+            '002_myRun-multiqc.html',
+            'myRun_excluded_intervals.bed'
+        ]
 
         with self.subTest('wrong sample files passed to check archival state'):
             assert sorted(mock_archive.call_args[1]['sample_files']) == \
                 sorted(expected_sample_files)
 
         with self.subTest('wrong run files passed to check archival state'):
-            assert mock_archive.call_args[1]['non_sample_files'] == \
+            assert sorted(mock_archive.call_args[1]['non_sample_files']) == \
                 expected_run_files
 
 
@@ -932,7 +937,6 @@ def test_unarchive_passed_to_check_archival_state(
             assert mock_archive.call_args[1]['unarchive'] == True
 
 
-
 class TestDXManageCheckArchivalState(unittest.TestCase):
     """
     Tests for DXManage.check_archival_state()

From d2f54efd3b343bad63a730f2d038c0bf5eac8600 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:16:44 +0100
Subject: [PATCH 29/45] add new input unarchive_only

---
 .../home/dnanexus/dias_batch/dias_batch.py    |  4 +++-
 .../dnanexus/dias_batch/utils/dx_requests.py  | 21 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index d8f76c5..a6030eb 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -293,7 +293,8 @@ def main(
     multiqc_report=None,
     testing=False,
     sample_limit=None,
-    unarchive=None
+    unarchive=None,
+    unarchive_only=None
 ):
     dxpy.set_workspace_id(os.environ.get('DX_PROJECT_CONTEXT_ID'))
 
@@ -389,6 +390,7 @@ def main(
         samples=manifest.keys(),
         path=single_output_dir,
         unarchive=unarchive,
+        unarchive_only=unarchive_only,
         modes={
             'cnv_reports': cnv_reports,
             'snv_reports': snv_reports,
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 13b2f59..90a20ba 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -366,7 +366,8 @@ def check_all_files_archival_state(
         samples,
         path,
         modes,
-        unarchive
+        unarchive,
+        unarchive_only=False
         ):
         """
         Checks for all specified file patterns and samples for each
@@ -386,6 +387,9 @@ def check_all_files_archival_state(
         unarchive : bool
             if to automatically unarchive files, will be passed through
             to self.check_archival_state
+        unarchive_only : bool
+            if to only check file archival status and exit without
+            returning to launch any jobs
         """
         print("\nChecking archival states for selected running modes:")
         prettier_print(modes)
@@ -454,6 +458,21 @@ def check_all_files_archival_state(
                 unarchive=unarchive
             )
 
+        if unarchive_only:
+            # unarchive only set and no files in an archived state otherwise
+            # dx_requests.DXManage.check_archival_state will have either
+            # raised a RuntimeError on archived files or an exit with
+            # zero exit code on no archived files found => just exit here
+            # and add a helpful tag to the job
+            dxpy.DXJob(dxid=os.environ.get('DX_JOB_ID')).add_tags(
+                ["unarchive_only set - no jobs launched"]
+            )
+            print(
+                "-iunarchive_only set and no files in archived state "
+                "- exiting now"
+            )
+            exit(0)
+
 
     def check_archival_state(
             self,

From a0cfb420c9f8884c40a140518e01c6db06c3448b Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:16:55 +0100
Subject: [PATCH 30/45] add unit tests for new input

---
 .../dias_batch/tests/test_dx_requests.py      | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index 2d68f71..f56873a 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -931,6 +931,39 @@ def test_unarchive_passed_to_check_archival_state(
             # should be passed through as True
             assert mock_archive.call_args[1]['unarchive'] == True
 
+    @patch('utils.dx_requests.exit')
+    @patch('utils.dx_requests.dxpy.DXJob')
+    def test_unarchive_only_correctly_tags_job_and_exits(
+        self, mock_job, mock_exit, mock_archive, mock_find
+    ):
+        """
+        Test that when we specify unarchive_only and there are no files
+        to unarchive (which would exit from dx_requests.DXManage.check_
+        archival_state) that we exit here with a zero exit code
+        """
+        DXManage().check_all_files_archival_state(
+            patterns=None,
+            samples=['sample_1', 'sample_2'],
+            path='project-xxx:/',
+            unarchive=False,
+            unarchive_only=True,
+            modes={
+                'cnv_reports': True,
+                'snv_reports': True,
+                'mosaic_reports': True,
+                'artemis': True
+            }
+        )
+
+        with self.subTest('stdout not correct'):
+            expected_stdout = (
+                '-iunarchive_only set and no files in archived state '
+                '- exiting now'
+            )
+            assert expected_stdout in self.capsys.readouterr().out
+
+        with self.subTest('DXJob.add_tags not called'):
+            assert mock_job.return_value.add_tags.call_count == 1
 
 
 class TestDXManageCheckArchivalState(unittest.TestCase):
@@ -1178,6 +1211,7 @@ def test_unarchive_files_called_when_specified(self, mock_unarchive):
         )
 
 
+
 class TestDXManageUnarchiveFiles(unittest.TestCase):
     """
     Tests for DXManage.unarchive_files()

From c09182be61091c625682a95b7fb8ac5d9c9dc1fa Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:18:29 +0100
Subject: [PATCH 31/45] update readme

---
 readme.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/readme.md b/readme.md
index ff76284..5171ffe 100644
--- a/readme.md
+++ b/readme.md
@@ -38,6 +38,7 @@ DNAnexus app for launching CNV calling, one or more of SNV, CNV and mosaic repor
 - `-iexclude_controls` (`bool`): controls if to automatically exclude control samples from CNV calling based on the regex pattern `'^\w+-\w+Q\w+-'` (default: `true`)
 - `-isplit_tests` (`bool`): controls if to split multiple panels / genes in a manifest to individual reports instead of being combined into one
 - `-iunarchive` (`bool`):  controls whether to automatically unarchive any required files that are archived. Default is to fail the app with a list of files required to unarchive. If set to true, all required files will start to be unarchived and the job will exit with a zero exit code and the job tagged to state no jobs were launched
+- `-iunarchive_only` (`bool`): controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code.
 
 
 #### Running modes

From 0d9d2c50af3fd9a7be87f4c6c50f85859ec76434 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:19:47 +0100
Subject: [PATCH 32/45] add unarchive_only to dxapp.json

---
 dxapp.json | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dxapp.json b/dxapp.json
index 043be05..fbb05d2 100644
--- a/dxapp.json
+++ b/dxapp.json
@@ -166,6 +166,14 @@
             "optional": true,
             "default": false,
             "help": "controls whether to automatically unarchive any required files that are archived. Default is to fail the app with a list of files required to unarchive. If set to true, all required files will start to be unarchived and the job will exit with a zero exit code and the job tagged to state no jobs were launched"
+          },
+          {
+            "name": "unarchive_only",
+            "label": "unarchive_only",
+            "class": "boolean",
+            "optional": true,
+            "default": false,
+            "help": "controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code"
           }
     ],
     "outputSpec": [

From 4b3217fcd2de46051d293da43cbee35a5e4f0f68 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:21:36 +0100
Subject: [PATCH 33/45] bump packaging -> 24.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 479788c..119b724 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 dxpy==0.318.1
-packaging==20.3
+packaging==24.1
 pandas==1.4.1
 pytest==7.0.1
 pytest-cov==4.0.0

From b3a9694cccfa0605059dadd4066d3f6b2993a282 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:22:44 +0100
Subject: [PATCH 34/45] fix typo

---
 resources/home/dnanexus/dias_batch/utils/dx_requests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 90a20ba..778ef73 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -459,7 +459,7 @@ def check_all_files_archival_state(
             )
 
         if unarchive_only:
-            # unarchive only set and no files in an archived state otherwise
+            # unarchive_only set and no files in an archived state otherwise
             # dx_requests.DXManage.check_archival_state will have either
             # raised a RuntimeError on archived files or an exit with
             # zero exit code on no archived files found => just exit here

From 5351aa355db34e503d6e068fb9bca40d32b7a3d9 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 18 Jul 2024 16:23:00 +0100
Subject: [PATCH 35/45] update readme

---
 readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 5171ffe..5533d1f 100644
--- a/readme.md
+++ b/readme.md
@@ -38,7 +38,7 @@ DNAnexus app for launching CNV calling, one or more of SNV, CNV and mosaic repor
 - `-iexclude_controls` (`bool`): controls if to automatically exclude control samples from CNV calling based on the regex pattern `'^\w+-\w+Q\w+-'` (default: `true`)
 - `-isplit_tests` (`bool`): controls if to split multiple panels / genes in a manifest to individual reports instead of being combined into one
 - `-iunarchive` (`bool`):  controls whether to automatically unarchive any required files that are archived. Default is to fail the app with a list of files required to unarchive. If set to true, all required files will start to be unarchived and the job will exit with a zero exit code and the job tagged to state no jobs were launched
-- `-iunarchive_only` (`bool`): controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code.
+- `-iunarchive_only` (`bool`): controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code
 
 
 #### Running modes

From a54543d88d2553e7b8f68c3596c087e2a51b470e Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Tue, 23 Jul 2024 10:06:15 +0100
Subject: [PATCH 36/45] fix #214 - replace colon in single gene name with
 underscore

---
 .../dias_batch/tests/test_dx_requests.py      | 54 +++++++++++++++++++
 .../dnanexus/dias_batch/utils/dx_requests.py  |  5 +-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index f7e0e8b..a9995f8 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -2362,6 +2362,60 @@ def test_name_suffix_integer_incremented(self):
         )
 
 
+    def test_single_gene_reports_name_have_no_colon(self):
+        """
+        Test that the output name for a single gene report has any
+        colon (e.g. from an HGNC ID) replaced with an underscore, since
+        colons in file names break file downloads
+        """
+        self.mock_find.side_effect = [
+            [],
+            [{
+                'describe': {
+                    'name': 'sample.vcf'
+                }
+            }],
+            [
+                {
+                    'project': 'project-xxx',
+                    'id': 'file-xxx',
+                    'describe': {
+                        'name': 'X1234.per-base.bed.gz'
+                    }
+                },
+                {
+                    'project': 'project-xxx',
+                    'id': 'file-xxx',
+                    'describe': {
+                        'name': 'X5678.per-base.bed.gz'
+                    }
+                }
+            ]
+        ]
+
+        # minimal manifest with indications and panels already parsed in;
+        # give sample X1234 a single gene test containing a colon
+        filled_manifest = deepcopy(self.mock_filter_manifest.return_value)
+        filled_manifest[0]["X1234"]["tests"] = [["_HGNC:1234"]]
+
+        self.mock_filter_manifest.return_value = filled_manifest
+        self.mock_index.return_value = 1
+
+        _, _, summary = DXExecute().reports_workflow(
+            mode='SNV',
+            workflow_id='workflow-GXzvJq84XZB1fJk9fBfG88XJ',
+            single_output_dir='/path_to_single/',
+            manifest=filled_manifest[0],
+            config=self.assay_config['modes']['snv_reports'],
+            start='230925_0943',
+            name_patterns=self.assay_config['name_patterns']
+        )
+
+        assert summary['SNV']['X1234'] == 'X1234_HGNC_1234_SNV_1', (
+            'naming of single gene test incorrect'
+        )
+
+
     def test_sample_limit_works(self):
         """
         Test when sample limit is set that it works as expected
diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 778ef73..37ba044 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -1271,8 +1271,9 @@ def reports_workflow(
                 # set prefix for naming output report with integer suffix
                 name = (
                     f"{vcf['describe']['name'].split('_')[0]}_"
-                    f"{'_'.join(test_list)}_{mode}".replace('__', '_')
-                )
+                    f"{'_'.join(test_list)}_{mode}"
+                ).replace(':', '_').replace('__', '_')
+
                 suffix = check_report_index(name=name, reports=xlsx_reports)
 
                 if sample_name_to_suffix.get(name):

From 990f389db70c86c21776877962e4b0409d9dfe1c Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 11:30:24 +0100
Subject: [PATCH 37/45] update default unarchive patterns for copy ratio bed
 files in defaults.py

---
 resources/home/dnanexus/dias_batch/utils/defaults.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/defaults.py b/resources/home/dnanexus/dias_batch/utils/defaults.py
index 28aa46d..058e5fc 100644
--- a/resources/home/dnanexus/dias_batch/utils/defaults.py
+++ b/resources/home/dnanexus/dias_batch/utils/defaults.py
@@ -27,8 +27,8 @@
         'sample':[
             'bam$',
             'bam.bai$',
-            '_copy_ratios.gcnv.bed$',
-            '_copy_ratios.gcnv.bed.tbi$'
+            '_copy_ratios.gcnv.bed.gz$',
+            '_copy_ratios.gcnv.bed.gz.tbi$'
         ],
         'run': [
             '-multiqc.html'

From 482d110a41d811336fdd928c1f1b479293bc4039 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 11:33:16 +0100
Subject: [PATCH 38/45] fix unit tests for defaults

---
 .../home/dnanexus/dias_batch/tests/test_dx_requests.py    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
index a9995f8..40e17bc 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dx_requests.py
@@ -780,11 +780,11 @@ def test_correct_patterns_provided_for_each_mode(
             ),
             'artemis_sample': (
                 'sample_1.*bam$|sample_1.*bam.bai$|'
-                'sample_1.*_copy_ratios.gcnv.bed$|'
-                'sample_1.*_copy_ratios.gcnv.bed.tbi$|'
+                'sample_1.*_copy_ratios.gcnv.bed.gz$|'
+                'sample_1.*_copy_ratios.gcnv.bed.gz.tbi$|'
                 'sample_2.*bam$|sample_2.*bam.bai$|'
-                'sample_2.*_copy_ratios.gcnv.bed$|'
-                'sample_2.*_copy_ratios.gcnv.bed.tbi$'
+                'sample_2.*_copy_ratios.gcnv.bed.gz$|'
+                'sample_2.*_copy_ratios.gcnv.bed.gz.tbi$'
             ),
             'artemis_run': '-multiqc.html'
         }

From fe054a795c0cfd2077443d7e90f8250524b5a03e Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 13:47:04 +0100
Subject: [PATCH 39/45] improve logging of unarchive warning on non-live files

---
 resources/home/dnanexus/dias_batch/utils/dx_requests.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/utils/dx_requests.py b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
index 37ba044..696bf5d 100644
--- a/resources/home/dnanexus/dias_batch/utils/dx_requests.py
+++ b/resources/home/dnanexus/dias_batch/utils/dx_requests.py
@@ -516,7 +516,6 @@ def check_archival_state(
         RuntimeError
             Raised when required files are archived and -iunarchive=False
         """
-        # non_sample_files = [] if not non_sample_files else non_sample_files
         print(
             f"\n \nChecking archival state of "
             f"{len(sample_files) + len(non_sample_files)} files..."
@@ -573,8 +572,9 @@ def check_archival_state(
         ])
 
         print(
-            f"\n \nWARNING: {len(not_live)} files to use for analysis "
-            f"are not in a live state:\n\t{not_live_printable}\n \n"
+            f"\n \nWARNING: {len(not_live)}/"
+            f"{len(sample_files) + len(non_sample_files)} files to use for "
+            f"analysis are not in a live state:\n\t{not_live_printable}\n \n"
         )
 
         print(f"{len(unarchiving)} files are currently in state 'unarchiving'")

From 9609ea572aef52e709d6b748aea206d33310f973 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 13:47:38 +0100
Subject: [PATCH 40/45] remove unused import

---
 resources/home/dnanexus/dias_batch/dias_batch.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index a6030eb..d568b3c 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -19,7 +19,6 @@
         make_path,
         parse_manifest,
         parse_genepanels,
-        prettier_print,
         time_stamp,
         write_summary_report
     )
@@ -32,7 +31,6 @@
         make_path,
         parse_manifest,
         parse_genepanels,
-        prettier_print,
         time_stamp,
         write_summary_report
     )

From b4d1dca6a0a9108566f7635fb427d0898ba8f6d5 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 13:54:57 +0100
Subject: [PATCH 41/45] always set unarchive=True if unarchive_only=True

---
 resources/home/dnanexus/dias_batch/dias_batch.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index d568b3c..3155811 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -62,6 +62,7 @@ def __init__(self, **inputs) -> None:
         self.check_assay()
         self.check_assay_config_dir()
         self.check_mode_set()
+        self.check_unarchive_set()
         self.check_single_output_dir()
         self.check_cnv_call_and_cnv_call_job_id_mutually_exclusive()
         self.check_cnv_calling_for_cnv_reports()
@@ -167,6 +168,18 @@ def check_mode_set(self):
                 'Reports argument specified with no manifest file'
             )
 
+    def check_unarchive_set(self):
+        """
+        Force self.inputs['unarchive'] to True when unarchive_only is
+        set but unarchive is unset or falsy, printing a notice first
+        """
+        if self.inputs.get('unarchive_only') and not self.inputs.get('unarchive'):
+            print(
+                "-iunarchive_only specified but -unarchive not specified, "
+                "setting unarchive to True"
+            )
+            self.inputs['unarchive'] = True
+
     def check_cnv_call_and_cnv_call_job_id_mutually_exclusive(self):
         """
         Check that both cnv_call and cnv_call_job_id have not been
@@ -301,6 +314,9 @@ def main(
     # assign single out dir in case of missing / output prefix to path
     single_output_dir = check.inputs['single_output_dir']
 
+    # re-read unarchive as CheckInputs.check_unarchive_set may have forced it to True
+    unarchive = check.inputs['unarchive']
+
     # time of running for naming output folders
     start_time = time_stamp()
 

From d6fa9d687e0784d72805718ec7517aaeae03d5ae Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 14:03:59 +0100
Subject: [PATCH 42/45] add unittest for added behaviour to set unarchive True
 on unarchive_only being set

---
 .../dias_batch/tests/test_dias_batch.py       | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py b/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
index 1c292ad..cb66c67 100644
--- a/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/tests/test_dias_batch.py
@@ -4,6 +4,7 @@
 import os
 import sys
 from unittest.mock import patch
+import unittest
 
 
 sys.path.append(os.path.abspath(
@@ -129,6 +130,39 @@ def test_check_no_mode_set(self, mocker):
             'Error not raised for no running mode set'
         )
 
+    def test_check_unarchive_behaviour_as_expected(self, mocker):
+        """
+        Check that check_unarchive_set forces unarchive to True when
+        unarchive_only is True, and leaves it unchanged otherwise
+        """
+        mocker.patch.object(CheckInputs, "__init__", return_value=None)
+        mocker.return_value = None
+
+        with unittest.TestCase().subTest('unarchive_only set to True'):
+            # Test when unarchive_only set to True we force unarchive
+            # to True also
+            check = CheckInputs()
+            check.inputs = {
+                'unarchive_only': True,
+                'unarchive': False
+            }
+            check.check_unarchive_set()
+
+            assert check.inputs['unarchive'] is True
+
+        with unittest.TestCase().subTest('unarchive_only set to False'):
+            # Test when unarchive_only set to False we do not force
+            # unarchive to True also
+            check = CheckInputs()
+            check.inputs = {
+                'unarchive_only': False,
+                'unarchive': False
+            }
+            check.check_unarchive_set()
+
+            assert check.inputs['unarchive'] is False
+
+
     def test_error_raised_for_no_manifest_with_reports_mode(self, mocker):
         """
         Test error is raised when a reports mode is set and no manifest given

From bcf5a61742b3eb300d04a0e9a95baff28b92f2b6 Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 14:05:01 +0100
Subject: [PATCH 43/45] update readme details on unarchive_only

---
 readme.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 5533d1f..c2c6f31 100644
--- a/readme.md
+++ b/readme.md
@@ -38,7 +38,8 @@ DNAnexus app for launching CNV calling, one or more of SNV, CNV and mosaic repor
 - `-iexclude_controls` (`bool`): controls if to automatically exclude control samples from CNV calling based on the regex pattern `'^\w+-\w+Q\w+-'` (default: `true`)
 - `-isplit_tests` (`bool`): controls if to split multiple panels / genes in a manifest to individual reports instead of being combined into one
 - `-iunarchive` (`bool`):  controls whether to automatically unarchive any required files that are archived. Default is to fail the app with a list of files required to unarchive. If set to true, all required files will start to be unarchived and the job will exit with a zero exit code and the job tagged to state no jobs were launched
-- `-iunarchive_only` (`bool`): controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code
+- `-iunarchive_only` (`bool`): controls if to only run the app to check for archived files and unarchive (i.e no launching of jobs), if all files are found in an unarchived state the app will exit with a zero exit code.
+  - n.b. in this mode `unarchive` is always forced to True (even if not specified), so unarchiving of any archived files found will always be started
 
 
 #### Running modes

From 0f4172a668f2f0e34fc77b7af90f7ef97028a28a Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Thu, 25 Jul 2024 14:57:19 +0100
Subject: [PATCH 44/45] fix typo in print

---
 resources/home/dnanexus/dias_batch/dias_batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index 3155811..cd09637 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -175,7 +175,7 @@ def check_unarchive_set(self):
         """
         if self.inputs.get('unarchive_only') and not self.inputs.get('unarchive'):
             print(
-                "-iunarchive_only specified but -unarchive not specified, "
+                "-iunarchive_only specified but -iunarchive not specified, "
                 "setting unarchive to True"
             )
             self.inputs['unarchive'] = True

From 3c92f6da3be5789038beca9471ec8698436e7caf Mon Sep 17 00:00:00 2001
From: Jethro Rainford <45037268+jethror1@users.noreply.github.com>
Date: Mon, 29 Jul 2024 14:25:32 +0100
Subject: [PATCH 45/45] Update dias_batch.py

ensure `CheckInputs.strip_string_inputs()` is called before other methods
---
 resources/home/dnanexus/dias_batch/dias_batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/home/dnanexus/dias_batch/dias_batch.py b/resources/home/dnanexus/dias_batch/dias_batch.py
index cd09637..445ada7 100644
--- a/resources/home/dnanexus/dias_batch/dias_batch.py
+++ b/resources/home/dnanexus/dias_batch/dias_batch.py
@@ -59,6 +59,7 @@ def __init__(self, **inputs) -> None:
 
         self.inputs = inputs
         self.errors = []
+        self.strip_string_inputs()
         self.check_assay()
         self.check_assay_config_dir()
         self.check_mode_set()
@@ -70,7 +71,6 @@ def __init__(self, **inputs) -> None:
         self.check_exclude_str_and_file()
         self.check_exclude_samples_file_id()
         self.check_qc_file()
-        self.strip_string_inputs()
 
         if self.errors:
             errors = '; '.join(x for x in self.errors)