From 10276faa4e935536cb186424e9321b4b8e71b921 Mon Sep 17 00:00:00 2001
From: Amine Ghozlane <amine.ghozlane@pasteur.fr>
Date: Fri, 4 Oct 2024 16:17:23 +0200
Subject: [PATCH] Update pysam, perform multiple alignment, activate
 multiprocessing

---
 meteor/phylogeny.py         | 142 ++++++++------
 meteor/strain.py            |   2 +-
 meteor/tests/test_strain.py |   2 +-
 meteor/variantcalling.py    | 368 ++++++++++++++++++++++--------------
 poetry.lock                 |  56 +++---
 5 files changed, 344 insertions(+), 226 deletions(-)

diff --git a/meteor/phylogeny.py b/meteor/phylogeny.py
index 22ce0b5..7cb75c6 100644
--- a/meteor/phylogeny.py
+++ b/meteor/phylogeny.py
@@ -28,12 +28,11 @@
 from collections import OrderedDict
 from datetime import datetime
 from typing import Iterable, Tuple
-from cogent3 import load_unaligned_seqs  # , load_aligned_seqs
-
-# from cogent3.evolve.distance import EstimateDistances
-# from cogent3.evolve.models import GTR
-# from cogent3.cluster.UPGMA import upgma
-from cogent3.align.progressive import tree_align
+from cogent3 import load_unaligned_seqs, make_aligned_seqs
+from cogent3.evolve.distance import EstimateDistances
+from cogent3.evolve.models import GTR
+from cogent3.cluster.UPGMA import upgma
+from concurrent.futures import ProcessPoolExecutor, as_completed
 
 
 @dataclass
@@ -88,9 +87,7 @@ def clean_sites(
             resultdict[gene_id] = output_seq
         print(flush=True, file=output)
         return resultdict, info_sites
-        # return info_sites
 
-    # def set_tree_config(self, raxml_ng_version: str) -> dict:  # pragma: no cover
     def set_tree_config(self):
         """Define the census configuration
 
@@ -102,8 +99,6 @@ def set_tree_config(self):
             "meteor_version": self.meteor.version,
             "phylogeny": {
                 "phylogeny_tool": "cogent3",
-                # "phylogeny_tool": "raxml-ng",
-                # "phylogeny_version": raxml_ng_version,
                 "phylogeny_date": datetime.now().strftime("%Y-%m-%d"),
                 "tree_files": ",".join([tree.name for tree in self.tree_files]),
             },
@@ -116,61 +111,98 @@ def remove_edge_labels(self, newick: str) -> str:
         # Replace matched patterns with ":" (effectively removing the edge label)
         return re.sub(pattern, ":", newick)
 
+    def process_msp_file(
+        self, msp_file: Path, idx: int, msp_count: int, tree_dir, tmp_dir
+    ) -> Tuple[Path, bool]:
+        """Process a single MSP file and generate its phylogeny tree."""
+        logging.info(
+            "%d/%d %s: Start analysis",
+            idx,
+            msp_count,
+            msp_file.name.replace(".fasta", ""),
+        )
+        tree_file = tree_dir / f"{msp_file.stem}.tree"
+
+        with NamedTemporaryFile(mode="wt", dir=tmp_dir, suffix=".fasta") as temp_clean:
+            # Clean sites
+            logging.info("Clean sites for %s", msp_file.name)
+            _, info_sites = self.clean_sites(msp_file, temp_clean)
+
+            if info_sites < self.min_info_sites:
+                logging.info(
+                    "Only %d informative sites (< %d threshold) left after cleaning, skipping %s.",
+                    info_sites,
+                    self.min_info_sites,
+                    msp_file.name.replace(".fasta", ""),
+                )
+                return tree_file, False  # Return False to indicate skipping
+
+            # Perform alignments and UPGMA
+            logging.info("Running UPGMA and Distance Estimation")
+            aligned_seqs = make_aligned_seqs(
+                load_unaligned_seqs(temp_clean.name, moltype="dna"),
+                moltype="dna",
+                array_align=True,
+            )
+            d = EstimateDistances(aligned_seqs, submodel=GTR())
+            d.run(show_progress=False)
+
+            # Create UPGMA Tree
+            mycluster = upgma(d.get_pairwise_distances())
+            mycluster = mycluster.unrooted_deepcopy()
+
+            with tree_file.open("w") as f:
+                f.write(
+                    self.remove_edge_labels(mycluster.get_newick(with_distances=True))
+                )
+
+            return tree_file, tree_file.exists()
+
     def execute(self) -> None:
         logging.info("Launch phylogeny analysis")
-        # Start phylogenies
         start = perf_counter()
+
         self.tree_files: list[Path] = []
         msp_count = len(self.msp_file_list)
-        for idx, msp_file in enumerate(self.msp_file_list, start=1):
-            logging.info(
-                "%d/%d %s: Start analysis",
-                idx,
-                msp_count,
-                msp_file.name.replace(".fasta", ""),
-            )
-            with NamedTemporaryFile(
-                mode="wt", dir=self.meteor.tmp_dir, suffix=".fasta"
-            ) as temp_clean:
-                tree_file = self.meteor.tree_dir / f"{msp_file.name}".replace(
-                    ".fasta", ""
-                )
-                # Clean sites
-                logging.info("Clean sites")
-                _, info_sites = self.clean_sites(msp_file, temp_clean)
-                if info_sites < self.min_info_sites:
-                    logging.info(
-                        "Only %d informative sites (< %d threshold) left after cleaning, skip.",
-                        info_sites,
-                        self.min_info_sites,
-                    )
-                else:
-                    seqs = load_unaligned_seqs(temp_clean.name, moltype="dna")
-                    # params = {"kappa": 4.0}
-                    _, tree = tree_align(
-                        "GTR",
-                        seqs,
-                        # param_vals=params,
-                        show_progress=False,
-                    )
-                    # print(aln)
-                    with tree_file.with_suffix(".tree").open("w") as f:
-                        f.write(
-                            self.remove_edge_labels(
-                                tree.get_newick(with_distances=True)
-                            )
+        # Using ProcessPoolExecutor to parallelize the MSP file processing
+        with ProcessPoolExecutor(max_workers=self.meteor.threads) as executor:
+            futures = {
+                executor.submit(
+                    self.process_msp_file,
+                    msp_file,
+                    idx,
+                    msp_count,
+                    self.meteor.tree_dir,
+                    self.meteor.tmp_dir,
+                ): msp_file
+                for idx, msp_file in enumerate(self.msp_file_list, start=1)
+            }
+
+            for future in as_completed(futures):
+                msp_file = futures[future]
+                try:
+                    tree_file, success = future.result()
+                    if success:
+                        self.tree_files.append(tree_file)
+                        logging.info(
+                            "Completed MSP tree for MSP %s",
+                            msp_file.name.replace(".fasta", ""),
+                        )
+                    else:
+                        logging.info(
+                            "Skipped MSP %s due to insufficient informative sites",
+                            msp_file.name.replace(".fasta", ""),
                         )
-                if tree_file.with_suffix(".tree").exists():
-                    self.tree_files.append(tree_file.with_suffix(".tree"))
-                    logging.info(
-                        "Completed MSP tree for MSP %s",
-                        msp_file.name.replace(".fasta", ""),
+                except Exception as exc:
+                    logging.error(
+                        "MSP %s generated an exception: %s", msp_file.name, exc
                     )
-                else:
-                    logging.info("No tree file generated")
+
         logging.info("Completed phylogeny in %f seconds", perf_counter() - start)
         logging.info(
             "Trees were generated for %d/%d MSPs", len(self.tree_files), msp_count
         )
+
+        # Save configuration after all trees are processed
         config = self.set_tree_config()
         self.save_config(config, self.meteor.tree_dir / "census_stage_4.json")
diff --git a/meteor/strain.py b/meteor/strain.py
index b9dc9ee..4f32da0 100644
--- a/meteor/strain.py
+++ b/meteor/strain.py
@@ -247,6 +247,7 @@ def execute(self) -> None:
             )
             sys.exit(1)
         try:
+            start = perf_counter()
             census_json = self.get_census_stage(self.meteor.mapped_sample_dir, 1)
             sample_info = census_json["sample_info"]
             stage3_dir = self.meteor.strain_dir / sample_info["sample_name"]
@@ -315,7 +316,6 @@ def execute(self) -> None:
                 / self.json_data["reference"]["reference_file"]["database_dir"]
                 / self.json_data["reference"]["annotation"]["bed"]["filename"]
             )
-            start = perf_counter()
             # count_file,
             self.get_msp_variant(
                 consensus_file, msp_file, cram_file, bed_file, reference_file
diff --git a/meteor/tests/test_strain.py b/meteor/tests/test_strain.py
index 3156565..ee672d5 100644
--- a/meteor/tests/test_strain.py
+++ b/meteor/tests/test_strain.py
@@ -98,4 +98,4 @@ def test_execute(strain_builder, tmp_path: Path) -> None:
     BS = tmp_path / "strain" / "test" / "BS.fasta.xz"
     assert BS.exists()
     with BS.open("rb") as out:
-        assert md5(out.read()).hexdigest() == "c4a414c7677da877a6b0a569f8950cda"
+        assert md5(out.read()).hexdigest() == "665997d7dc24653bc001c2789fecb8fb"
diff --git a/meteor/variantcalling.py b/meteor/variantcalling.py
index fdde991..6c85fce 100644
--- a/meteor/variantcalling.py
+++ b/meteor/variantcalling.py
@@ -16,6 +16,7 @@
 import sys
 import lzma
 import bgzip
+import pickle
 from subprocess import CalledProcessError, run, Popen, PIPE
 from dataclasses import dataclass
 from pathlib import Path
@@ -25,76 +26,87 @@
 from tempfile import NamedTemporaryFile
 from packaging.version import parse
 from pysam import AlignmentFile, FastaFile, VariantFile, faidx, tabix_index
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from collections import defaultdict
 import pandas as pd
 from typing import ClassVar
 import numpy as np
-import os
-import pickle
+import pysam
 
 
-# Helper function for multiprocessing
-def process_msp_file(
-    meteor_tmp_dir, meteor_tree_dir, min_info_sites, idx, msp_file, msp_count, max_gap
+def run_freebayes_chunk(
+    temp_ref_file_path: str,
+    bed_chunk_file: Path,
+    cram_file: Path,
+    vcf_chunk_file: Path,
+    min_snp_depth: int,
+    min_frequency: float,
+    ploidy: int,
 ):
-    """
-    Process single MSP file and return the path to the generated tree file or None if unsuccessful.
-    This runs in a separate process.
-    """
-    pattern = r"\b(edge\.\d+):\b"
-
-    logging.info(
-        "%d/%d %s: Start analysis",
-        idx,
-        msp_count,
-        msp_file.name.replace(".fasta", ""),
-    )
-
-    with NamedTemporaryFile(
-        mode="wt", dir=meteor_tmp_dir, suffix=".fasta"
-    ) as temp_clean:
-        tree_file = Path(meteor_tree_dir) / f"{msp_file.name}".replace(".fasta", "")
-
-        # Clean sites (use a dummy clean sites function for illustration)
-        logging.info("Clean sites")
-        # Replace the dummy version below with the actual logic to clean sites:
-        info_sites = 20  # Fake number of informative sites for demo
-
-        if info_sites < min_info_sites:
-            logging.info(
-                "Only %d informative sites (<%d threshold) left after cleaning, skip.",
-                info_sites,
-                min_info_sites,
-            )
-            return None  # Skip this file, return None
-
-        # Perform sequence alignment and tree generation
-        seqs = load_unaligned_seqs(msp_file, moltype="dna")
-        params = {"kappa": 4.0}
-        aln, tree = tree_align(
-            "HKY85",
-            seqs,
-            param_vals=params,
-            show_progress=False,
-        )
-        print(aln)
-        with tree_file.with_suffix(".tree").open("w") as f:
-            f.write(re.sub(pattern, ":", tree.get_newick(with_distances=True)))
+    """Run freebayes on one BED chunk and write its output as a bgzipped, tabix-indexed VCF."""
+    try:
+        with Popen(
+            [
+                "freebayes",
+                "--pooled-continuous",
+                "--min-alternate-count",
+                str(1),
+                "--min-coverage",
+                str(min_snp_depth),
+                "--min-alternate-fraction",
+                str(min_frequency),
+                "--min-mapping-quality",
+                str(0),
+                "--use-duplicate-reads",
+                "-t",
+                str(
+                    bed_chunk_file.resolve()
+                ),  # BED region chunk for parallel execution
+                "-p",
+                str(ploidy),
+                "-f",
+                temp_ref_file_path,  # Path to temporary reference FASTA file
+                "-b",
+                str(cram_file.resolve()),  # BAM/CRAM alignment file
+            ],
+            stdout=PIPE,
+            stderr=PIPE,
+        ) as freebayes_process:
+            # Capture output from freebayes
+            freebayes_output, freebayes_error = freebayes_process.communicate()
+            if freebayes_error:
+                logging.error(
+                    "Error processing chunk %s: %s", bed_chunk_file, freebayes_error
+                )
+                return None
+            elif freebayes_process.returncode == 0:
+                # Compress output using bgzip
+                with vcf_chunk_file.open("wb") as raw:
+                    with bgzip.BGZipWriter(raw) as fh:
+                        fh.write(freebayes_output)
+                tabix_index(str(vcf_chunk_file.resolve()), preset="vcf", force=True)
+            else:
+                logging.error(
+                    "Freebayes process failed for chunk %s, return code: %d",
+                    bed_chunk_file,
+                    freebayes_process.returncode,
+                )
+                return None
+        return vcf_chunk_file
 
-    if tree_file.with_suffix(".tree").exists():
-        logging.info(
-            "Completed MSP tree for MSP %s",
-            msp_file.name.replace(".fasta", ""),
+    except CalledProcessError as e:
+        logging.error(
+            "Freebayes failed for chunk %s with return code %d",
+            bed_chunk_file,
+            e.returncode,
         )
-        return tree_file.with_suffix(".tree")
-    else:
-        logging.info("No tree file generated")
-        return None
+        logging.error("Error output: %s", e.output)
+        raise e
 
 
 @dataclass
 class VariantCalling(Session):
-    """Run bcftools"""
+    """Run freebayes"""
 
     # from https://www.bioinformatics.org/sms/iupac.html
     IUPAC: ClassVar[dict] = {
@@ -160,6 +172,58 @@ def set_variantcalling_config(
         }
         return config
 
+    def create_bed_chunks(
+        self, merged_df: pd.DataFrame, num_chunks: int, tmp_dir: Path
+    ) -> list[Path]:
+        """
+        Divide the merged_df DataFrame into `num_chunks` BED chunks,
+        with each chunk containing multiple `msp_name` groups.
+        Each chunk will be written to a separate temporary BED file.
+        """
+        bed_chunks = []
+
+        # Get unique msp_name groups
+        msp_names = merged_df["msp_name"].unique()
+        total_msp_names = len(msp_names)
+
+        # Calculate how many MSPs each chunk should include
+        base_chunk_size = total_msp_names // num_chunks
+
+        # Number of chunks that will get an extra MSP name (due to remainder)
+        remainder = total_msp_names % num_chunks
+
+        start_idx = 0  # Starting index for slicing the msp_names
+
+        # Split the msp_names into balanced chunks
+        for i in range(num_chunks):
+            # Determine the chunk size: add 1 to the base size if within the remainder limit
+            chunk_size = base_chunk_size + 1 if i < remainder else base_chunk_size
+
+            # Determine the end index for this chunk
+            end_idx = start_idx + chunk_size
+
+            # Select the subset of MSPs for this chunk
+            current_msp_names = msp_names[start_idx:end_idx]
+
+            # Subset the DataFrame for only these selected MSPs
+            chunk_df = merged_df[merged_df["msp_name"].isin(current_msp_names)]
+
+            # Create a temporary BED file for this chunk
+            temp_bed_file = NamedTemporaryFile(suffix=".bed", dir=tmp_dir, delete=False)
+
+            # Write the chunk DataFrame subset to the temporary BED file
+            chunk_df[["gene_id", "startpos", "gene_length"]].to_csv(
+                temp_bed_file.name, sep="\t", index=False, header=False
+            )
+
+            # Add the path to the temporary file to the list of bed_chunks
+            bed_chunks.append(Path(temp_bed_file.name))
+
+            # Update the starting index for the next chunk
+            start_idx = end_idx
+
+        return bed_chunks  # Return the list of file paths.
+
     def group_consecutive_positions(
         self, position_count_dict: dict, gene_name: str, gene_length: int
     ):
@@ -237,6 +301,29 @@ def count_reads_in_gene(
 
         return reads_dict
 
+    def merge_vcf_files(self, vcf_file_list, output_vcf):
+        """Merge VCF files, keeping only the first record seen at each (chrom, pos)."""
+        variant_dict = defaultdict(list)
+
+        # Collect all records from all files
+        for i, vcf_file in enumerate(vcf_file_list):
+            with VariantFile(vcf_file, threads=self.meteor.threads) as vcf_in:
+                # Get the header from the first input VCF
+                if i == 0:
+                    vcf_header = vcf_in.header
+                for rec in vcf_in:
+                    # Use (chrom, pos) tuple as key to merge records based on positions
+                    variant_dict[(rec.chrom, rec.pos)].append(rec)
+        # Write the merged VCF output
+        with VariantFile(
+            str(output_vcf.resolve()),
+            "w",
+            header=vcf_header,
+            threads=self.meteor.threads,
+        ) as vcf_out:
+            for _, rec_list in variant_dict.items():
+                vcf_out.write(rec_list[0])
+
     # @memory_profiler.profile
     def filter_low_cov_sites(
         self,
@@ -326,7 +413,7 @@ def create_consensus(
             )
         )
         # low_cov_sites_dict = low_cov_sites.groupby(low_cov_sites.index).apply(lambda x: x.to_dict(orient='records')).to_dict()
-        with VariantFile(vcf_file) as vcf:
+        with VariantFile(str(vcf_file.resolve()), threads=self.meteor.threads) as vcf:
             with FastaFile(filename=str(reference_file.resolve())) as Fasta:
                 with lzma.open(consensus_file, "wt", preset=0) as consensus_f:
                     # Iterate over all reference sequences in the fasta file
@@ -446,11 +533,6 @@ def execute(self) -> None:
             / self.census["reference"]["reference_file"]["fasta_dir"]
             / self.census["reference"]["reference_file"]["fasta_filename"]
         )
-        bed_file = (
-            self.meteor.ref_dir
-            / self.census["reference"]["reference_file"]["database_dir"]
-            / self.census["reference"]["annotation"]["bed"]["filename"]
-        ).resolve()
         msp_file = (
             self.meteor.ref_dir
             / self.census["reference"]["reference_file"]["database_dir"]
@@ -464,29 +546,6 @@ def execute(self) -> None:
         )
         msp_content = self.load_data(msp_file)
         gene_details = self.load_data(annotation_file)
-        if self.census["reference"]["reference_info"]["database_type"] == "complete":
-            msp_content = msp_content.loc[msp_content["gene_category"] == "core"]
-            msp_content = (
-                msp_content.groupby("msp_name")
-                .head(self.core_size)
-                .reset_index(drop=True)
-            )
-            # print(msp_content)
-            # print(gene_details)
-            merged_df = pd.merge(msp_content, gene_details, on="gene_id")
-            # Add a constant column with value 0
-            merged_df["startpos"] = 0
-            # Extract the required columns
-            result_df = merged_df[["gene_id", "startpos", "gene_length"]]
-            with NamedTemporaryFile(
-                suffix=".bed", dir=self.meteor.tmp_dir, delete=False
-            ) as temp_bed_file:
-                result_df.to_csv(
-                    temp_bed_file.name, sep="\t", index=False, header=False
-                )
-            # print("Why is empty ?")
-            # print(temp_bed_file.name)
-            bed_file = temp_bed_file.name
         freebayes_exec = run(
             ["freebayes", "--version"], check=False, capture_output=True
         )
@@ -525,62 +584,83 @@ def execute(self) -> None:
                 temp_ref_file_path = temp_ref_file.name
             # index on the fly
             faidx(temp_ref_file.name)
-        try:
-            if not vcf_file.exists():
-                with Popen(
-                    [
-                        "freebayes",
-                        # "-i",  # no indel
-                        # "-X",
-                        # "-u",  # no complex observation that may include ins
-                        "--pooled-continuous",
-                        "--min-alternate-count",
-                        str(1),
-                        "--min-coverage",
-                        str(self.min_snp_depth),
-                        "--min-alternate-fraction",
-                        str(self.min_frequency),
-                        "--min-mapping-quality",
-                        str(0),
-                        "--use-duplicate-reads",
-                        "-t",
-                        str(bed_file),
-                        "-p",
-                        str(self.ploidy),
-                        "-f",
-                        temp_ref_file_path,
-                        "-b",
-                        str(cram_file.resolve()),
-                    ],
-                    stdin=PIPE,
-                    stdout=PIPE,
-                ) as freebayes_process:
-                    # capture output of bcftools_process
-                    freebayes_output = freebayes_process.communicate(
-                        input=decompressed_reference
-                    )[0]
-                    # print(freebayes_output)
-                    # compress output using bgzip
-                    with vcf_file.open("wb") as raw:
-                        with bgzip.BGZipWriter(raw) as fh:
-                            fh.write(freebayes_output)
-        except CalledProcessError as e:
-            logging.error("Freebayes failed with return code %d", e.returncode)
-            logging.error("Output: %s", e.output)
-            sys.exit()
-        finally:
-            if temp_ref_file_path is not None:
-                temp_ref_file_path = Path(temp_ref_file_path)
-                # Ensure the temporary file is removed after use
-                if temp_ref_file_path.exists():
-                    temp_ref_file_path.unlink(missing_ok=True)
-                if Path(f"{temp_ref_file_path}.fai").exists():
-                    Path(f"{temp_ref_file_path}.fai").unlink(missing_ok=True)
+            # Prepare the gene data by merging content and creating the necessary fields for the BED format
+            msp_content = msp_content[msp_content["gene_category"] == "core"]
+            msp_content = (
+                msp_content.groupby("msp_name")
+                .head(self.core_size)
+                .reset_index(drop=True)
+            )  # Limit to core_size per `msp_name`
+
+            # Merge with gene details
+            merged_df = pd.merge(msp_content, gene_details, on="gene_id")
+
+            # Add BED columns (we assume `startpos` is 0 and `gene_length` is the length of the gene)
+            merged_df["startpos"] = 0
+            merged_df["gene_length"] = merged_df["gene_length"].astype(
+                int
+            )  # Ensure these are integers
+            result_df = merged_df[["gene_id", "startpos", "gene_length"]]
+            with NamedTemporaryFile(
+                suffix=".bed", dir=self.meteor.tmp_dir, delete=False
+            ) as temp_bed_file:
+                result_df.to_csv(
+                    temp_bed_file.name, sep="\t", index=False, header=False
+                )
+            bed_file = temp_bed_file.name
+            # Create bed_chunk files. Each file stores multiple `msp_name` regions
+            bed_chunks = self.create_bed_chunks(
+                merged_df, self.meteor.threads, self.meteor.tmp_dir
+            )
+            # Pre-create one temporary VCF output file per BED chunk and keep its path
+            vcf_chunk_files = [
+                NamedTemporaryFile(
+                    suffix=".vcf.gz", dir=self.meteor.tmp_dir, delete=False
+                ).name
+                for _ in bed_chunks
+            ]
+            # Use ProcessPoolExecutor to run freebayes in parallel on each BED chunk
+            with ProcessPoolExecutor(max_workers=self.meteor.threads) as executor:
+                futures = {
+                    executor.submit(
+                        run_freebayes_chunk,
+                        temp_ref_file_path,  # Pass the path to the reference file
+                        bed_chunk_file,  # Each BED chunk
+                        cram_file,
+                        Path(vcf_chunk_file),
+                        self.min_snp_depth,
+                        self.min_frequency,
+                        self.ploidy,
+                    ): bed_chunk_file
+                    for bed_chunk_file, vcf_chunk_file in zip(
+                        bed_chunks, vcf_chunk_files
+                    )
+                }
+
+                # Iterate through completed futures
+                for future in as_completed(futures):
+                    bed_chunk = futures[future]
+                    try:
+                        vcf_chunk_file = future.result()
+                        logging.info(
+                            "Processed BED chunk %s -> VCF chunk %s",
+                            bed_chunk,
+                            vcf_chunk_file,
+                        )
+                    except Exception as exc:
+                        logging.error("Error processing chunk %s: %s", bed_chunk, exc)
+
+            logging.info("All chunks have been processed")
+            # Combine VCF chunk files into the final VCF
+            if len(vcf_chunk_files) > 1:
+                logging.info("Merging vcf")
+                self.merge_vcf_files(vcf_chunk_files, vcf_file)
+            else:
+                Path(vcf_chunk_files[0]).rename(vcf_file)
         logging.info(
             "Completed freebayes step in %f seconds", perf_counter() - startfreebayes
         )
         # Index the vcf file
-
         startindexing = perf_counter()
         if not Path(f"{vcf_file}.tbi").exists():
             logging.info("Indexing")
@@ -629,7 +709,7 @@ def execute(self) -> None:
             bed_file,
         )
         logging.info(
-            "Completed consensus step with python in %f seconds",
+            "Completed consensus step in %f seconds",
             perf_counter() - startconsensuspython,
         )
         logging.info("Completed SNP calling in %f seconds", perf_counter() - start)
@@ -637,5 +717,11 @@ def execute(self) -> None:
             cram_file, vcf_file, consensus_file, freebayes_version
         )
         self.save_config(config, self.census["Stage3FileName"])
-        if os.path.isfile(temp_bed_file.name):
-            os.remove(temp_bed_file.name)
+        # Cleanup temporary files
+        temporary_files = (
+            [temp_ref_file_path] + vcf_chunk_files + [f"{temp_ref_file_path}.fai"]
+        )
+        for temp_file in temporary_files:
+            p = Path(temp_file)
+            if p.exists():
+                p.unlink(missing_ok=True)
diff --git a/poetry.lock b/poetry.lock
index 2b61ba5..7ae25ef 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -766,38 +766,38 @@ testutils = ["gitpython (>3)"]
 
 [[package]]
 name = "pysam"
-version = "0.22.0"
+version = "0.22.1"
 description = "Package for reading, manipulating, and writing genomic data"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "pysam-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:116278a7caa122b2b8acc56d13b3599be9b1236f27a12488bffc306858ff0d57"},
-    {file = "pysam-0.22.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:da2f1af461e44d5c2c7210d458ee216f8ab98486adf1eea6c88eea5c1058a62f"},
-    {file = "pysam-0.22.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:021fbf6874ad998aba19be33828ad9d23d52273643793488ac4b12917d714c68"},
-    {file = "pysam-0.22.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26199e403855b9da45341d25682e0df27013687d9cb1b4fd328136fbd506292b"},
-    {file = "pysam-0.22.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9bfebf89b1dc2ff6f88d64b5f05d8630deb89562b22764f8ee7f6fa9e677bb91"},
-    {file = "pysam-0.22.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:942dd4a2263996bc2daa21200886e9fde027f32ce8820e7832b20bbdb97eb393"},
-    {file = "pysam-0.22.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:83776ba587eb9575a209efed1cedb49d69c5fa6cc520dd722a0a09d0bb4e9b87"},
-    {file = "pysam-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:4779a99d1ece17a98724d87a5c10c455cf212b3baa3a8399d3d072e4d0ae5ba0"},
-    {file = "pysam-0.22.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:bb61bf30c15f6767403b423b04c293e96fd7635457b506c849aafcf48fc13242"},
-    {file = "pysam-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:32042e0bf3c5dd8554769442c2e1f7b6ada902c33ee44c616d0403e7acd12ee3"},
-    {file = "pysam-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f23b2f47528b94e8abe3b700103fb1214c623ae1c1b8125ecf22d4d33d76720f"},
-    {file = "pysam-0.22.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cfd2b858c7405cf38c730cba779ddf9f8cff28b4842c6440e64781650dcb9a52"},
-    {file = "pysam-0.22.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:87dbf72f3e61fd6d3f92b1b683d9a9e797b6cc213ffcd971899f24a16f9f6e8f"},
-    {file = "pysam-0.22.0-cp36-cp36m-manylinux_2_28_aarch64.whl", hash = "sha256:9af1cd3d07fd4c84e9b3d8a46c65b25f95278185bc6d44c4a48951679d5189ac"},
-    {file = "pysam-0.22.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:f73d7923c89618fb7024875ed8eddc5fb0c911f430e3495de482fcee48143e45"},
-    {file = "pysam-0.22.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6ffe5c98725fea54b1b2aa8f14a60ee9ceaed32c04460d1b861a62603dcd7153"},
-    {file = "pysam-0.22.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:34f5653a82138d28a8e86205785a0398eb6c89f776b4145ff42783168757323c"},
-    {file = "pysam-0.22.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:9d3ebb1515c2fd9b11823469e5b211ca3cc89e976c00c284a2190804c9f11726"},
-    {file = "pysam-0.22.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9b8e18520e7a79bad91b44cf9199c7fa42cec5c3020024d7ef9a7161d0099bf8"},
-    {file = "pysam-0.22.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a98d1ddca64943f3ead507721e52466aea2f7303e549d4960a2eb1d9fff8e3d7"},
-    {file = "pysam-0.22.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:6d6aa2346b11ad35e88c65eb0067321318c25c7f35f75c98061173eabefcf8b0"},
-    {file = "pysam-0.22.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:4f6657a09c81333adb5545cf9a20d4c2ca1686acf8609ad58f13b3ec1b52a9cf"},
-    {file = "pysam-0.22.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:93eb12be3822fb387e5438811f62a0f5e56c1edd5c830aaa316fb50d3d0bc181"},
-    {file = "pysam-0.22.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9ba53f9b0b2c5cb57908855cdb35a31b34c5211d215aa01bdb3e9b3d05c659cc"},
-    {file = "pysam-0.22.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:1b84f99aa04e30bd1cc35c01bd41c2b7680131f56c71a740805aff8086f24b56"},
-    {file = "pysam-0.22.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:481e4efbfbc07b6b92194a005cb9a98006c8378024f41c7b66c58b14f6e77f9c"},
-    {file = "pysam-0.22.0.tar.gz", hash = "sha256:ab7a46973cf0ab8c6ac327f4c3fb67698d7ccbeef8631a716898c6ba01ef3e45"},
+    {file = "pysam-0.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f18e72013ef2db9a9bb7e8ac421934d054427f6c03e66ce8abc39b09c846ba72"},
+    {file = "pysam-0.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:79cd94eeb96541385fa99e759a8f83d21428e092c8b577d50b4eee5823e757cd"},
+    {file = "pysam-0.22.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c71ea45461ee596949061f321a799a97c418164485fdd7e8db89aea2ff979092"},
+    {file = "pysam-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ab3343f221994d163e1ba2691430ce0f6e7da13762473e0d7f9a2d5db3bec235"},
+    {file = "pysam-0.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:503c833e6cf348d87aec9113b1386d5c85c031d64deb914c29f5ad1792d103e6"},
+    {file = "pysam-0.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4447fdc2630519a00b6bf598995f1440e6f398eb0c084a7c141db026990ae07a"},
+    {file = "pysam-0.22.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1be663a73cf56ddd1d309b91d314a0c94c9bf352eaa3c6eda30cef12699843f0"},
+    {file = "pysam-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:aeb31472365014fd8b37da4a88af758094b5872a8a16a25635a52cf8ceff5a9f"},
+    {file = "pysam-0.22.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e72e129d245574801125029a5892c9e18d2956b13c4203ea585cbd64ccde9351"},
+    {file = "pysam-0.22.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f8f00bb1fb977fc33c87cf5fe9023eefc2ba3d43d30ab4875a1765827018c949"},
+    {file = "pysam-0.22.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c0e051fda433c1c7ff94532f60477bb83b97f4bb183567a0ae23f340e1c200b4"},
+    {file = "pysam-0.22.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:860c7c78ddb1539b83d5476502ba14c8b4e8435810dc7a5b715196da3dfb86b6"},
+    {file = "pysam-0.22.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:18d886d50d75d8f853057fbbb284f0f0e98afad1f76b1a6f55660ea167d31c17"},
+    {file = "pysam-0.22.1-cp36-cp36m-manylinux_2_28_aarch64.whl", hash = "sha256:44420290a619c02da48ca0956548eb82a1665ae97b6ee69c094f9da5a6206431"},
+    {file = "pysam-0.22.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:acff506c921af36f364c5a87f3a30b3c105ebeb270d0e821c2ca571eaf60ca20"},
+    {file = "pysam-0.22.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:098e0bf12d8b0399613065843310c91ba31a02d014b1f6b4e9d7f2d0d1254ff8"},
+    {file = "pysam-0.22.1-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:cd9d457063272df16136640515183ea501bf3371f140a134b2f0a42f425a37d9"},
+    {file = "pysam-0.22.1-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:af9fb53157ba2431b7b20a550c0223f4a039304c9f180d8da98ea9d2d3ef3fbf"},
+    {file = "pysam-0.22.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d3fd6fe5aca79933632f38e5b568ce8d4e67e5c4f3bd39bff55fd9646af814d2"},
+    {file = "pysam-0.22.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b6cf1871c99cfc9c01261ec5f628519c2c889f0ff070e7a26aa5adbf9f69af1"},
+    {file = "pysam-0.22.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:b1addca11c5cfceefaebdfcf3d83bc42f4b89fb1e8ae645a4bdab971cbcd2bc0"},
+    {file = "pysam-0.22.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:17fac22fc89c86241a71084ca097878c61c97f6ff5fd4535d718681a849852a7"},
+    {file = "pysam-0.22.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4aff9b41856d5dba6585ffd60884b8f3778c5d2688f33989662aabe7f4cd0fe0"},
+    {file = "pysam-0.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faa5298291b54f185c7b8f84510224918bddc64bbdcb2e8426ff43e83452310f"},
+    {file = "pysam-0.22.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:4dfae1de006d1c6491a59b00052a3f67c53a136165cf4edd7789b5dcb1e6806f"},
+    {file = "pysam-0.22.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:78ed746a39c9cebe489b8f0f86cf23c09c942e76c901260fb2794906e4cd0e26"},
+    {file = "pysam-0.22.1.tar.gz", hash = "sha256:18a0b97be95bd71e584de698441c46651cdff378db1c9a4fb3f541e560253b22"},
 ]
 
 [[package]]