diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index b4bbc72..4915d37 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -16,16 +16,17 @@ import errno import io +import json import logging import os from os import PathLike from pathlib import Path -import random import re import subprocess import shutil +import glob import tempfile -from typing import Dict, List, Optional +from typing import Dict, List logger = logging.getLogger("__name__." + __name__) @@ -80,8 +81,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: Check number of transcript lines in the GTF Arg: - gtf_file: GTF file path - content_obj: Object to detect and count in the gtf i.e transcript, repeat + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat Return: Number of occurences """ @@ -126,7 +127,7 @@ def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dic if len(current_seq) > min_seq_length: seq_region_to_length[current_header] = len(current_seq) - # logging.info("length of the seq region %s", seq_region_to_length) + #logging.info("length of the seq region %s", seq_region_to_length) return seq_region_to_length @@ -166,7 +167,7 @@ def get_slice_id( if (end - start + 1) >= min_length: slice_ids_per_region.append([seq_region, start, end]) start = end + 1 - # logging.info("list of slice ids %s", str(slice_ids_per_region)) + #logging.info("list of slice ids %s", str(slice_ids_per_region)) return slice_ids_per_region @@ -252,7 +253,9 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches # Example key KS8000.rs1.re1000000.1 gene_id_slice = input_file.name + "." + str(current_gene_id) # Example key KS8000.rs1.re1000000.1.transcript.1 - transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + transcript_id_slice = ( + gene_id_slice + "." + str(current_transcript_id) + ) # If there is no existing entry, the gene key will be added # and the gene counter is incremented. 
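                            # A minimal illustration of this keying scheme (editorial sketch,
                            # not part of the patch): only the first sighting of a slice/gene
                            # key advances the counter; repeated keys reuse the stored id.
                            #
                            #   ids, counter = {}, 1
                            #   for key in ["a.gtf.1", "a.gtf.1", "b.gtf.1"]:
                            #       if key not in ids:
                            #           ids[key] = f"gene{counter}"
                            #           counter += 1
                            #   # ids == {"a.gtf.1": "gene1", "b.gtf.1": "gene2"}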
# gene_id "gene1"; transcript_id "gene1.t1" @@ -277,12 +280,16 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches + ".t" + str(transcript_id_count_gene_id[gene_id_slice]) ) - gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + gene_transcript_id_collection[ + transcript_id_slice + ] = new_transcript_id transcript_id_count_gene_id[gene_id_slice] += 1 else: # If a transcript of the same set is already present, # the new id with the incremented counter is added - new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + new_transcript_id = gene_transcript_id_collection[ + transcript_id_slice + ] attribs = re.sub( full_transcript_id_string, 'transcript_id "' + new_transcript_id + '"', @@ -295,15 +302,13 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches else: if new_id_prefix == "repeatmask": match_feature_type = re.search( - r"(" + feature_id_label + r"\d+)", + r"(" + feature_id_label + "\d+)", line, ) else: - match_feature_type = ( - re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', - r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", - line, - ) + match_feature_type = re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, ) if match_feature_type: full_feature_id = match_feature_type.group(1) @@ -321,8 +326,6 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches "Feature type not recognised, will skip. Feature type: %s", values[2], ) - - def get_sequence( seq_region: str, start: int, @@ -351,7 +354,9 @@ def get_sequence( "get_sequence %s", f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", ) - with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=str(output_dir) + ) as bed_temp_file: bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") bed_temp_file.close() bedtools_command = [ @@ -362,10 +367,10 @@ def get_sequence( "-bed", str(bed_temp_file.name), ] - bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with - bedtools_command, stdout=subprocess.PIPE - ) # pylint:disable=consider-using-with - for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + bedtools_output = subprocess.Popen(bedtools_command, stdout=subprocess.PIPE) + for idx, line in enumerate( + io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8") # type: ignore + ): if idx == 1: if strand == 1: sequence = line.rstrip() @@ -399,155 +404,958 @@ def check_file(file_path: Path) -> None: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) -def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: - """ - Save fastq sequences in a dictionary - fasta_lines: List[str]: List of sequences - """ - index: Dict[str, str] = {} - header: Optional[str] = None - seq: List[str] = [] - - for line in fasta_lines: - line = line.strip() - header_match = re.match(r">(.+)\n$", line) - if header_match: - if header: - index[header] = "".join(seq) - header = None - header = header_match.group(1) - seq = [] - else: - seq.append(line) - # Add the last sequence - if header: - index[header] = "".join(seq) - return index - +def load_results_to_ensembl_db( + main_script_dir, + load_to_ensembl_db, + genome_file, + main_output_dir, + db_details, + num_threads, + ): + db_loading_script = os.path.join( + main_script_dir, "support_scripts_perl", "load_gtf_ensembl.pl" + ) + 
db_loading_dir = create_dir(main_output_dir, "db_loading") -# def fasta_to_dict(fasta_lines:List[str]): -# index = {} -# for line in fasta_lines: -# #it = iter(fasta_lines) -# #for header in it: -# match = re.search(r">(.+)\n$", line) -# header = match.group(1) -# seq = next(line) -# index[header] = seq -# return index + # Should collapse this into a function + annotation_results_gtf_file = os.path.join( + main_output_dir, "annotation_output", "annotation.gtf" + ) + if os.path.exists(annotation_results_gtf_file): + logger.info("Loading main geneset to db") + batch_size = 200 + load_type = "gene" + analysis_name = "ensembl" + gtf_records = batch_gtf_records( + annotation_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find the main gene annotation file, so not loading. Path checked:\n" + + annotation_results_gtf_file + ) + rfam_results_gtf_file = os.path.join(main_output_dir, "rfam_output", "annotation.gtf") + if os.path.exists(rfam_results_gtf_file): + logger.info("Loading Rfam-based sncRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + rfam_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Rfam annotation file, so not loading. Path checked:\n" + + rfam_results_gtf_file + ) -def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List: - """ - The protein dataset file is splitted by a number of sequence equals to the batch_size - in batch files stored in 10 output directories. - protein_dataset : Path for the protein dataset. - output_dir : Output directory path. - batch_size : Size of the batch, it needs to be equals to the number of threads - to parallelise the sequence processing for each file. 
- """ - batched_protein_files = [] - - for i in range(0, 10): - create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset, "r", encoding="utf8") as file_in: - seq_count = 0 - batch_count = 0 - current_record = "" - initial_seq = True - for line in file_in: - match = re.search(r">(.+)$", line) - # match header and is not first sequence, if the number of stored sequences in each file equals - # the number of batch_size, a new file will be created and the current_record reset - if match and not initial_seq and seq_count % batch_size == 0: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batch_count += 1 - seq_count += 1 - current_record = line - batched_protein_files.append(batch_file) - # match header and is the first sequence - elif match: - current_record += line - initial_seq = False - seq_count += 1 - # other lines - else: - current_record += line + trnascan_results_gtf_file = os.path.join( + main_output_dir, "trnascan_output", "annotation.gtf" + ) + if os.path.exists(trnascan_results_gtf_file): + logger.info("Loading tRNAScan-SE tRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + trnascan_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an tRNAScan-SE annotation file, so not loading. Path checked:\n" + + trnascan_results_gtf_file + ) - if current_record: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batched_protein_files.append(batch_file) - return batched_protein_files + dust_results_gtf_file = os.path.join(main_output_dir, "dust_output", "annotation.gtf") + if os.path.exists(dust_results_gtf_file): + logger.info("Loading Dust repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "dust" + gtf_records = batch_gtf_records( + dust_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Dust annotation file, so not loading. Path checked:\n" + + dust_results_gtf_file + ) + red_results_gtf_file = os.path.join(main_output_dir, "red_output", "annotation.gtf") + if os.path.exists(red_results_gtf_file): + logger.info("Loading Red repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "repeatdetector" + gtf_records = batch_gtf_records( + red_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Red annotation file, so not loading. 
Path checked:\n" + + red_results_gtf_file + ) -def run_command(cmd) -> None: - """Submit command in multiprocessing action + trf_results_gtf_file = os.path.join(main_output_dir, "trf_output", "annotation.gtf") + if os.path.exists(trf_results_gtf_file): + logger.info("Loading TRF repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "trf" + gtf_records = batch_gtf_records( + trf_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a TRF annotation file, so not loading. Path checked:\n" + + trf_results_gtf_file + ) - Args: - cmd (List[str]): command to execute - """ - try: - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Command executed successfully: %s", cmd) - logger.info("Output: %s", result.stdout) - except subprocess.CalledProcessError as e: - logger.error("Command failed: %s", cmd) - logger.error("Error: %s", e.stderr) + cpg_results_gtf_file = os.path.join(main_output_dir, "cpg_output", "annotation.gtf") + if os.path.exists(cpg_results_gtf_file): + logger.info("Loading CpG islands to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "cpg" + gtf_records = batch_gtf_records( + cpg_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a CpG annotation file, so not loading. Path checked:\n" + + cpg_results_gtf_file + ) + eponine_results_gtf_file = os.path.join( + main_output_dir, "eponine_output", "annotation.gtf" + ) + if os.path.exists(eponine_results_gtf_file): + logger.info("Loading Eponine repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "eponine" + gtf_records = batch_gtf_records( + eponine_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Eponine annotation file, so not loading. Path checked:\n" + + eponine_results_gtf_file + ) -def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]: - """ - Parses a GTF file to extract gene, transcript, and exon information. + logger.info("Finished loading records to db") + + +def generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ): + pool = multiprocessing.Pool(int(num_threads)) + for record_batch in gtf_records: + pool.apply_async( + multiprocess_load_records_to_ensembl_db, + args=( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + record_batch, + ), + ) - Args: - gtf_file: Path to the GTF file. 
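+    # Editorial note: apply_async swallows worker exceptions unless the
+    # AsyncResult handles are kept and .get() is called on them; a variant
+    # that surfaces errors (hypothetical, not in this patch) would be:
+    #
+    # results = [pool.apply_async(worker, args=a) for a in batches]
+    # pool.close()
+    # pool.join()
+    # for r in results:
+    #     r.get()  # re-raises any exception raised inside the worker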
+ pool.close() + pool.join() + + +def multiprocess_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + output_dir, + load_type, + analysis_name, + record_batch, + ): + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=output_dir + ) as gtf_temp_out: + for line in record_batch: + gtf_temp_out.writelines(line) + gtf_temp_file_path = gtf_temp_out.name + + (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",") + + loading_cmd = [ + "perl", + db_loading_script, + "-genome_file", + genome_file, + "-dbname", + db_name, + "-host", + db_host, + "-port", + str(db_port), + "-user", + db_user, + "-pass", + db_pass, + "-gtf_file", + gtf_temp_file_path, + "-analysis_name", + analysis_name, + "-load_type", + load_type, + ] - Returns: - A dictionary containing gene information, organized by gene_id, transcript_id, and features. - """ - gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} + if load_type == "gene" and analysis_name == "ensembl": + loading_cmd.extend( + [ + "-protein_coding_biotype", + "anno_protein_coding", + "-non_coding_biotype", + "anno_lncRNA", + ] + ) - with open(gtf_file, "r") as gtf_in: - for line in gtf_in: - eles = line.strip().split("\t") - if len(eles) != 9: + if load_to_ensembl_db == "single_transcript_genes": + loading_cmd.append("-make_single_transcript_genes") + + logger.info(" ".join(loading_cmd)) + subprocess.run(loading_cmd) + gtf_temp_out.close() + os.remove(gtf_temp_file_path) # doesn't seem to be working + logger.info("Finished: " + gtf_temp_file_path) + gc.collect() + + +def batch_gtf_records(input_gtf_file, batch_size, output_dir, record_type): + records = [] + gtf_in = open(input_gtf_file) + if record_type == "gene": + # NOTE that the neverending variations on GTF reading/writing/merging + # is becoming very messy + # need to create a set of utility methods outside of this script + # This one assumes the file has unique ids for the parent features. 
+ # It then batches them into + # sets of records based on the batch size passed in + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() continue - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) - if not match: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() continue + match = re.search(r'gene_id "([^"]+)"', line) gene_id = match.group(1) - transcript_id = match.group(2) - feature_type = eles[2] - if gene_id not in gtf_genes: - gtf_genes[gene_id] = {} + if not current_gene_id: + record_counter += 1 + current_gene_id = gene_id + + if not gene_id == current_gene_id: + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + current_gene_id = gene_id + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) + + elif record_type == "single_line_feature": + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + record_counter += 1 + + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) - if feature_type == "transcript": - gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} - elif feature_type == "exon": - if transcript_id in gtf_genes[gene_id]: - gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) + gtf_in.close() + + return records + + +def run_find_orfs(genome_file, main_output_dir): + min_orf_length = 600 + + orf_output_dir = create_dir(main_output_dir, "orf_output") + seq_region_lengths = get_seq_region_lengths(genome_file, 5000) + for region_name in seq_region_lengths: + region_length = seq_region_lengths[region_name] + seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) + for phase in range(0, 6): + find_orf_phased_region( + region_name, seq, phase, min_orf_length, orf_output_dir + ) + + +def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): + current_index = phase + orf_counter = 1 + if phase > 2: + seq = reverse_complement(seq) + current_index = current_index % 3 + + orf_file_path = os.path.join( + orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa") + ) + orf_out = open(orf_file_path, "w+") + + while current_index < len(seq): + codon = seq[current_index : current_index + 3] + if codon == "ATG": + orf_seq = codon + for j in range(current_index + 3, len(seq), 3): + next_codon = seq[j : j + 3] + if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": + orf_seq += next_codon + if len(orf_seq) >= min_orf_length: + orf_out.write( + ">" + + region_name + + "_phase" + + str(phase) + + "_orf" + + str(orf_counter) + + "\n" + ) + orf_out.write(orf_seq + "\n") + orf_counter += 1 + orf_seq = "" + break + + # If there's another met in phase, then put i to the start of + # the codon after j so that only the longest ORF is found + if next_codon == "ATG": + current_index = j + 3 + orf_seq += next_codon + current_index += 3 + orf_out.close() + +def bed_to_gff(input_dir, hints_file): + gff_out = open(hints_file, "w+") + 
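+    # Editorial sketch of what bed_block_to_exons (defined elsewhere) is
+    # assumed to do with the BED12 columns parsed below: blockStarts are
+    # relative to chromStart, so each exon spans blockSize bases starting
+    # at offset + blockStart (the exact 0/1-based handling lives in that
+    # helper; coords are kept as strings to match the exon_key concatenation).
+    #
+    # def _bed_block_to_exons_sketch(block_sizes, block_starts, offset):
+    #     exons = []
+    #     for size, start in zip(block_sizes, block_starts):
+    #         exon_start = offset + int(start)
+    #         exons.append([str(exon_start), str(exon_start + int(size))])
+    #     return exons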
exons_dict = {} + for bed_file in glob.glob(input_dir + "/*.bed"): + logger.info("Processing file for hints:") + logger.info(bed_file) + bed_in = open(bed_file) + bed_lines = bed_in.readlines() + for line in bed_lines: + line = line.rstrip() + elements = line.split("\t") + seq_region_name = elements[0] + offset = int(elements[1]) + hit_name = elements[3] + strand = elements[5] + block_sizes = elements[10].split(",") + block_sizes = list(filter(None, block_sizes)) + block_starts = elements[11].split(",") + block_starts = list(filter(None, block_starts)) + exons = bed_block_to_exons(block_sizes, block_starts, offset) + for i, element in enumerate(exons): + exon_coords = exons[i] + exon_key = ( + seq_region_name + + ":" + + exon_coords[0] + + ":" + + exon_coords[1] + + ":" + + strand + ) + if exon_key in exons_dict: + exons_dict[exon_key][5] += 1 + else: + gff_list = [ + seq_region_name, + "CDNA", + "exon", + exon_coords[0], + exon_coords[1], + 1.0, + strand, + ".", + ] + exons_dict[exon_key] = gff_list + + for exon_key, gff_list in exons_dict.items(): + gff_list[5] = str(gff_list[5]) + gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" + gff_out.write(gff_line) + + gff_out.close() + + sorted_hints_out = open((hints_file + ".srt"), "w+") + subprocess.run( + ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], + stdout=sorted_hints_out, + ) + sorted_hints_out.close() + +def splice_junction_to_gff(input_dir, hints_file): + sjf_out = open(hints_file, "w+") + + for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): + sjf_in = open(sj_tab_file) + sjf_lines = sjf_in.readlines() + for line in sjf_lines: + elements = line.split("\t") + strand = "+" + # If the strand is undefined then skip, Augustus expects a strand + if elements[3] == "0": + continue + elif elements[3] == "2": + strand = "-" + + junction_length = int(elements[2]) - int(elements[1]) + 1 + if junction_length < 100: + continue + + if not elements[4] and elements[7] < 10: + continue + + # For the moment treat multimapping and single mapping things as a combined score + score = float(elements[6]) + float(elements[7]) + score = str(score) + output_line = [ + elements[0], + "RNASEQ", + "intron", + elements[1], + elements[2], + score, + strand, + ".", + ("src=W;mul=" + score + ";"), + ] + sjf_out.write("\t".join(output_line) + "\n") + + sjf_out.close() + + +def split_genome(genome_file, target_dir, min_seq_length): + # This is the lazy initial way of just splitting into a dir + # of files based on the toplevel sequence with a min sequence length filter + # There are a couple of obvious improvements: + # 1) Instead of making files for all seqs, just process N seqs + # parallel, where N = num_threads. Then you could clean up the seq file + # after each seq finishes, thus avoiding potentially + # having thousands of file in a dir + # 2) Split the seq into even slices and process these in parallel + # (which the same cleanup as in 1). For sequences smaller than the + # target slice size, bundle them up together into a single file. Vastly + # more complex, partially implemented in the splice_genome method + # Allows for more consistency with parallelisation (since there should + # be no large outliers). 
But require a mapping strategy for the + # coords and sequence names and all the hints will need to be adjusted + current_header = "" + current_seq = "" + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + print( + "Found an existing split file, so will not overwrite. File found:" + ) + print(file_out_name) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + line = file_in.readline() + + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + logger.info( + "Found an existing split file, so will not overwrite. File found:" + ) + logger.info(file_out_name) + + file_in.close() + + +def update_gtf_genes(parsed_gtf_genes, combined_results, validation_type): + output_lines = [] + + for gene_id in parsed_gtf_genes.keys(): + transcript_ids = parsed_gtf_genes[gene_id].keys() + for transcript_id in transcript_ids: + transcript_line = parsed_gtf_genes[gene_id][transcript_id]["transcript"] + single_cds_exon_transcript = 0 + translation_match = re.search( + r'; translation_coords "([^"]+)";', transcript_line + ) + if translation_match: + translation_coords = translation_match.group(1) + translation_coords_list = translation_coords.split(":") + # If the start exon coords of both exons are the same, + # then it's the same exon and thus a single exon cds + if translation_coords_list[0] == translation_coords_list[3]: + single_cds_exon_transcript = 1 + + exon_lines = parsed_gtf_genes[gene_id][transcript_id]["exons"] + validation_results = combined_results[transcript_id] + rnasamba_coding_probability = float(validation_results[0]) + rnasamba_coding_potential = validation_results[1] + cpc2_coding_probability = float(validation_results[2]) + cpc2_coding_potential = validation_results[3] + transcript_length = int(validation_results[4]) + peptide_length = int(validation_results[5]) + diamond_e_value = None + if len(validation_results) == 7: + diamond_e_value = validation_results[6] + + avg_coding_probability = ( + rnasamba_coding_probability + cpc2_coding_probability + ) / 2 + max_coding_probability = max( + rnasamba_coding_probability, cpc2_coding_probability + ) + + match = re.search(r'; biotype "([^"]+)";', transcript_line) + biotype = match.group(1) + if biotype == "busco" or biotype == "protein": + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + continue + + min_single_exon_pep_length = 100 + min_multi_exon_pep_length = 75 + min_single_source_probability = 0.8 + min_single_exon_probability = 0.9 + + # Note that the below looks at validating things + # under different levels of strictness + # There are a few different continue statements, + # where transcripts will be skipped resulting + # in a smaller post validation file. 
It mainly + # removes single coding exon genes with no real + # support or for multi-exon lncRNAs that are less than 200bp long + if single_cds_exon_transcript == 1 and validation_type == "relaxed": + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + elif single_cds_exon_transcript == 1 and validation_type == "moderate": + if ( + diamond_e_value is not None + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and avg_coding_probability >= min_single_exon_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + else: + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_multi_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_multi_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif transcript_length >= 200: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "lncRNA";', + transcript_line, + ) + transcript_line = re.sub( + ' translation_coords "[^"]+";', "", transcript_line + ) + else: + continue + + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + + return output_lines + + +def read_gtf_genes(gtf_file): + gtf_genes = {} + gtf_in = open(gtf_file) + line = gtf_in.readline() + while line: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + + if not match: + line = gtf_in.readline() + continue + + gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {} + gtf_genes[gene_id][transcript_id]["transcript"] = line + gtf_genes[gene_id][transcript_id]["exons"] = [] + elif feature_type == "exon": + gtf_genes[gene_id][transcript_id]["exons"].append(line) + line = gtf_in.readline() + gtf_in.close() return gtf_genes -def 
check_transcriptomic_output(main_output_dir: Path) -> None: - """Check across the various transcriptomic if there is a reasonable amount of data +def fasta_to_dict(fasta_list): + index = {} + it = iter(fasta_list) + for header in it: + match = re.search(r">(.+)\n$", header) + header = match.group(1) + seq = next(it) + index[header] = seq + return index - Args: - main_output_dir (Path): Output dir path - Raises: - IOError: If no lines or fewer than the minimum expected lines are found in the output files. - """ +# def merge_gtf_files(file_paths,id_label): + +# gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# gtf_out = open(gtf_file_path,'w+') +# for gtf_file_path in gtf_files: +# gtf_file_name = os.path.basename(gtf_file_path) +# match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# start_offset = int(match.group(1)) +# gtf_in = open(gtf_file_path,'r') +# line = gtf_in.readline() +# while line: +# values = line.split("\t") +# if len(values) == 9 and (values[2] in feature_types): +# values[3] = str(int(values[3]) + (start_offset - 1)) +# values[4] = str(int(values[4]) + (start_offset - 1)) +# gtf_out.write("\t".join(values)) +# line = gtf_in.readline() +# gtf_in.close() +# gtf_out.close() + + +def multiprocess_generic(cmd): + print(" ".join(cmd)) + subprocess.run(cmd) + + +def seq_region_names(genome_file): + region_list = [] + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">([^\s]+)", line) + if match: + region_name = match.group(1) + if region_name == "MT": + logger.info("Skipping region named MT") + line = file_in.readline() + continue + else: + region_list.append(match.group(1)) + line = file_in.readline() + + return region_list + + +def slice_genome(genome_file, target_dir, target_slice_size): + # The below is sort of tested + # Without the + target_seq_length = 50000000 + min_seq_length = 1000 + current_header = "" + current_seq = "" + seq_dict = {} + for line in seq: # pylint:disable=used-before-assignment + match = re.search(r">(.+)$", line) + if match and current_header: + seq_dict[current_header] = current_seq + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + seq_dict[current_header] = current_seq + + seq_buffer = 0 + file_number = 0 + file_name = "genome_file_" + str(file_number) + + for header in seq_dict: + seq_iterator = 0 + seq = seq_dict[header] + + while len(seq) > target_seq_length: + file_out = open(os.path.join(target_dir, file_name), "w+") + subseq = seq[0:target_seq_length] + file_out.write( + ">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n" + ) + file_out.close() + seq = seq[target_seq_length:] + seq_iterator += 1 + file_number += 1 + file_name = "genome_file_" + str(file_number) + + if len(seq) >= min_seq_length: + file_name = "genome_file_" + str(file_number) + file_out = open(os.path.join(file_name), "w+") + file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") + file_out.close() + file_number += 1 + file_name = "genome_file_" + str(file_number) + + +def coallate_results(main_output_dir): + results_dir = create_dir(main_output_dir, "results") + output_dirs = [ + "augustus_output", + "cpg_output", + "dust_output", + "eponine_output", + "red_output", + "rfam_output", + "trf_output", + "trnascan_output", + ] + for output_dir in output_dirs: + match = re.search(r"(.+)_output", output_dir) + result_type = match.group(1) + results_path = os.path.join(main_output_dir, 
output_dir, "annotation.gtf") + copy_path = os.path.join(results_dir, (result_type + ".gtf")) + if os.path.exists(results_path): + cpy_cmd = ["cp", results_path, copy_path] + subprocess.run(cpy_cmd) + + + +def check_transcriptomic_output(main_output_dir): + + # This will check across the various transcriptomic + # dirs and check there's actually some data transcriptomic_dirs = [ "scallop_output", "stringtie_output", @@ -556,23 +1364,31 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: total_lines = 0 min_lines = 100000 for transcriptomic_dir in transcriptomic_dirs: - full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf" - if not full_file_path.exists(): + full_file_path = os.path.join( + main_output_dir, transcriptomic_dir, "annotation.gtf" + ) + if not os.path.exists(full_file_path): logger.warning( - "Warning, no annotation.gtf found for %s. This might be fine, \ - e.g. no long read data were provided", - transcriptomic_dir, + "Warning, no annotation.gtf found for " + + transcriptomic_dir + + ". This might be fine, e.g. no long read data were provided" ) continue - num_lines = sum(1 for _ in full_file_path.open("r")) - total_lines += num_lines - logging.info("For %s found a total of in the annotation.gtf file", num_lines) + num_lines = sum(1 for line in open(full_file_path)) + total_lines = total_lines + num_lines + logger.info( + "For " + + transcriptomic_dir + + " found a total of " + + str(num_lines) + + " in the annotation.gtf file" + ) if total_lines == 0: raise IOError( "Anno was run with transcriptomic mode enabled,\ but the transcriptomic annotation files are empty" ) - if total_lines <= min_lines: + elif total_lines <= min_lines: raise IOError( "Anno was run with transcriptomic mode enabled, \ but the total number of lines in the output \ @@ -582,318 +1398,9 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: "Min allowed: " + str(min_lines) ) - - -# def run_find_orfs(genome_file, main_output_dir): -# min_orf_length = 600 - -# orf_output_dir = create_dir(main_output_dir, "orf_output") -# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) -# for region_name in seq_region_lengths: -# region_length = seq_region_lengths[region_name] -# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) -# for phase in range(0, 6): -# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) - - -# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): -# current_index = phase -# orf_counter = 1 -# if phase > 2: -# seq = reverse_complement(seq) -# current_index = current_index % 3 - -# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) -# orf_out = open(orf_file_path, "w+") - -# while current_index < len(seq): -# codon = seq[current_index : current_index + 3] -# if codon == "ATG": -# orf_seq = codon -# for j in range(current_index + 3, len(seq), 3): -# next_codon = seq[j : j + 3] -# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": -# orf_seq += next_codon -# if len(orf_seq) >= min_orf_length: -# orf_out.write( -# ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" -# ) -# orf_out.write(orf_seq + "\n") -# orf_counter += 1 -# orf_seq = "" -# break - -# # If there's another met in phase, then put i to the start of -# # the codon after j so that only the longest ORF is found -# if next_codon == "ATG": -# current_index = j + 3 -# orf_seq += next_codon -# current_index += 3 
-# orf_out.close() - - -# def bed_to_gff(input_dir, hints_file): -# gff_out = open(hints_file, "w+") -# exons_dict = {} -# for bed_file in glob.glob(input_dir + "/*.bed"): -# logger.info("Processing file for hints:") -# logger.info(bed_file) -# bed_in = open(bed_file) -# bed_lines = bed_in.readlines() -# for line in bed_lines: -# line = line.rstrip() -# elements = line.split("\t") -# seq_region_name = elements[0] -# offset = int(elements[1]) -# hit_name = elements[3] -# strand = elements[5] -# block_sizes = elements[10].split(",") -# block_sizes = list(filter(None, block_sizes)) -# block_starts = elements[11].split(",") -# block_starts = list(filter(None, block_starts)) -# exons = bed_block_to_exons(block_sizes, block_starts, offset) -# for i, element in enumerate(exons): -# exon_coords = exons[i] -# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand -# if exon_key in exons_dict: -# exons_dict[exon_key][5] += 1 -# else: -# gff_list = [ -# seq_region_name, -# "CDNA", -# "exon", -# exon_coords[0], -# exon_coords[1], -# 1.0, -# strand, -# ".", -# ] -# exons_dict[exon_key] = gff_list - -# for exon_key, gff_list in exons_dict.items(): -# gff_list[5] = str(gff_list[5]) -# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" -# gff_out.write(gff_line) - -# gff_out.close() - -# sorted_hints_out = open((hints_file + ".srt"), "w+") -# subprocess.run( -# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], -# stdout=sorted_hints_out, -# ) -# sorted_hints_out.close() - - -# def splice_junction_to_gff(input_dir, hints_file): -# sjf_out = open(hints_file, "w+") - -# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): -# sjf_in = open(sj_tab_file) -# sjf_lines = sjf_in.readlines() -# for line in sjf_lines: -# elements = line.split("\t") -# strand = "+" -# # If the strand is undefined then skip, Augustus expects a strand -# if elements[3] == "0": -# continue -# elif elements[3] == "2": -# strand = "-" - -# junction_length = int(elements[2]) - int(elements[1]) + 1 -# if junction_length < 100: -# continue - -# if not elements[4] and elements[7] < 10: -# continue - -# # For the moment treat multimapping and single mapping things as a combined score -# score = float(elements[6]) + float(elements[7]) -# score = str(score) -# output_line = [ -# elements[0], -# "RNASEQ", -# "intron", -# elements[1], -# elements[2], -# score, -# strand, -# ".", -# ("src=W;mul=" + score + ";"), -# ] -# sjf_out.write("\t".join(output_line) + "\n") - -# sjf_out.close() - - -# def split_genome(genome_file, target_dir, min_seq_length): -# # This is the lazy initial way of just splitting into a dir -# # of files based on the toplevel sequence with a min sequence length filter -# # There are a couple of obvious improvements: -# # 1) Instead of making files for all seqs, just process N seqs -# # parallel, where N = num_threads. Then you could clean up the seq file -# # after each seq finishes, thus avoiding potentially -# # having thousands of file in a dir -# # 2) Split the seq into even slices and process these in parallel -# # (which the same cleanup as in 1). For sequences smaller than the -# # target slice size, bundle them up together into a single file. Vastly -# # more complex, partially implemented in the splice_genome method -# # Allows for more consistency with parallelisation (since there should -# # be no large outliers). 
But require a mapping strategy for the -# # coords and sequence names and all the hints will need to be adjusted -# current_header = "" -# current_seq = "" - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">(.+)$", line) -# if match and current_header: -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# print("Found an existing split file, so will not overwrite. File found:") -# print(file_out_name) - -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# line = file_in.readline() - -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# logger.info("Found an existing split file, so will not overwrite. File found:") -# logger.info(file_out_name) - -# file_in.close() - - -# # def merge_gtf_files(file_paths,id_label): - -# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') -# # gtf_out = open(gtf_file_path,'w+') -# # for gtf_file_path in gtf_files: -# # gtf_file_name = os.path.basename(gtf_file_path) -# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) -# # start_offset = int(match.group(1)) -# # gtf_in = open(gtf_file_path,'r') -# # line = gtf_in.readline() -# # while line: -# # values = line.split("\t") -# # if len(values) == 9 and (values[2] in feature_types): -# # values[3] = str(int(values[3]) + (start_offset - 1)) -# # values[4] = str(int(values[4]) + (start_offset - 1)) -# # gtf_out.write("\t".join(values)) -# # line = gtf_in.readline() -# # gtf_in.close() -# # gtf_out.close() - - -# def multiprocess_generic(cmd): -# print(" ".join(cmd)) -# subprocess.run(cmd) - - -# def seq_region_names(genome_file): -# region_list = [] - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">([^\s]+)", line) -# if match: -# region_name = match.group(1) -# if region_name == "MT": -# logger.info("Skipping region named MT") -# line = file_in.readline() -# continue -# else: -# region_list.append(match.group(1)) -# line = file_in.readline() - -# return region_list - - -# def slice_genome(genome_file, target_dir, target_slice_size): -# # The below is sort of tested -# # Without the -# target_seq_length = 50000000 -# min_seq_length = 1000 -# current_header = "" -# current_seq = "" -# seq_dict = {} -# for line in seq: # pylint:disable=used-before-assignment -# match = re.search(r">(.+)$", line) -# if match and current_header: -# seq_dict[current_header] = current_seq -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# seq_dict[current_header] = current_seq - -# seq_buffer = 0 -# file_number = 0 -# file_name = "genome_file_" + str(file_number) - -# for header in seq_dict: -# seq_iterator = 0 -# seq = seq_dict[header] - -# while len(seq) > target_seq_length: -# file_out = open(os.path.join(target_dir, file_name), "w+") -# subseq = seq[0:target_seq_length] -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + 
subseq + "\n") -# file_out.close() -# seq = seq[target_seq_length:] -# seq_iterator += 1 -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - -# if len(seq) >= min_seq_length: -# file_name = "genome_file_" + str(file_number) -# file_out = open(os.path.join(file_name), "w+") -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") -# file_out.close() -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - - -# def coallate_results(main_output_dir): -# results_dir = create_dir(main_output_dir, "results") -# output_dirs = [ -# "augustus_output", -# "cpg_output", -# "dust_output", -# "eponine_output", -# "red_output", -# "rfam_output", -# "trf_output", -# "trnascan_output", -# ] -# for output_dir in output_dirs: -# match = re.search(r"(.+)_output", output_dir) -# result_type = match.group(1) -# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") -# copy_path = os.path.join(results_dir, (result_type + ".gtf")) -# if os.path.exists(results_path): -# cpy_cmd = ["cp", results_path, copy_path] -# subprocess.run(cpy_cmd) + else: + logger.info( + "Found " + + str(total_lines) + + " total lines across the transcriptomic files. Checks passed" + ) \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/utils/_utils_copy.py b/src/python/ensembl/tools/anno/utils/_utils_copy.py new file mode 100644 index 0000000..b4bbc72 --- /dev/null +++ b/src/python/ensembl/tools/anno/utils/_utils_copy.py @@ -0,0 +1,899 @@ +# pylint: disable=missing-module-docstring +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import io +import logging +import os +from os import PathLike +from pathlib import Path +import random +import re +import subprocess +import shutil +import tempfile +from typing import Dict, List, Optional + +logger = logging.getLogger("__name__." + __name__) + +# with open(Path(__file__).parents[6] / "conf/config.json", "r", encoding="utf8") as f: +# config = json.load(f) + + +def create_dir(input_dir: Path, dir_name: str) -> Path: + """ + Create directory or subdirectory and log operations. 
+ Args: + main_output_dir: str main output directory path + dir_name: str optional subdirectory to be created + Returns: + str Path to the created directory + """ + if dir_name: + target_dir = Path(input_dir) / str(dir_name) + else: + target_dir = input_dir + + if os.path.exists(target_dir): + logger.warning("Directory already exists, will not create again") + return target_dir + + logger.info("Attempting to create target dir: %s", target_dir) + + try: + os.mkdir(target_dir) + except OSError: + logger.error("Creation of the dir failed, path used: %s", target_dir) + else: + logger.info("Successfully created the dir on the following path: %s", target_dir) + return target_dir + + +def check_exe(exe_bin: PathLike) -> None: + """ + Check executable path + Args: + exe_bin: Executable path + + Raises: + OSError: Executable path does not exist + """ + if not shutil.which(exe_bin): + raise OSError(f"Exe does not exist. Path checked: {exe_bin}") + + +def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: + """ + Check number of transcript lines in the GTF + + Arg: + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat + + Return: Number of occurences + """ + obj_count = 0 + with open(gtf_file) as gtf_in: + for line in gtf_in: + gtf_raw = line.split("\t") + if not len(gtf_raw) == 9: + continue + if gtf_raw[2] == content_obj: + obj_count += 1 + logger.info("%s; Number of %s detected: %d", gtf_file, content_obj, int(obj_count)) + return obj_count + + +def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dict: + """ + Split the genome file according to the header and store in a dictionary + all the sequences whose length is greater than min_seq_length. + Args: + genome_file: Genome file path. + min_seq_length: Minimum slice length. 
+ Return: Dictionary of sequence headers and the corresponding sequence length + """ + current_header = "" + current_seq = "" + + seq_region_to_length = {} + with open(genome_file) as file_in: + for line in file_in: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + # logging.info("length of the seq region %s", seq_region_to_length) + return seq_region_to_length + + +def get_slice_id( + seq_region_to_length: Dict, + slice_size: int = 1000000, + overlap: int = 0, + min_length: int = 0, +) -> List: + """ + Get list of ids for a genomic slice + Arg: + seq_region_to_length: Dictionary with the sequence headers as keys and the sequence lengths as values + slice_size: Size of the slice + overlap: Overlap length between two slices + min_length: Min length of the slice + Return: List of IDs for each genomic slice + """ + + slice_ids_per_region = [] + for seq_region in seq_region_to_length: + seq_region_length = int(seq_region_to_length[seq_region]) + if seq_region_length < min_length: + continue + + if seq_region_length <= slice_size: + slice_ids_per_region.append([seq_region, 1, seq_region_length]) + continue + + start = 1 + end = start + slice_size - 1 + while end < seq_region_length: + start = start - overlap + start = max(start, 1) + end = start + slice_size - 1 + end = min(end, seq_region_length) + if (end - start + 1) >= min_length: + slice_ids_per_region.append([seq_region, start, end]) + start = end + 1 + # logging.info("list of slice ids %s", str(slice_ids_per_region)) + return slice_ids_per_region + + +def update_string(text, lookup_text, new_text) -> str: + """ + Update substring + Args: + text : Text. + lookup_text : String to look for. + new_text : String to substitute. + Return + Text with string substitution + """ + text = re.sub( + lookup_text, + new_text, + text, + ) + return text + + +# slice_output_to_gtf(repeatmasker_dir, "repeat_id", "repeatmask", True, ".rm.gtf") +def slice_output_to_gtf( # pylint: disable=too-many-branches + output_dir: Path, + feature_id_label: str = "", + new_id_prefix: str = "", + unique_ids: bool = True, + file_extension: str = ".gtf", +) -> None: + """ + Collect all the gtf files per file extension and merge them in a final gtf file assigning unique ids. + + This holds keys of the current slice details with the gene id to form unique keys. + Each time a new key is added the overall gene counter is incremented + and the value of the key is set to the new gene id. Any subsequent + lines with the same region/gene id key will then just get + the new id without incrementing the counter. + + Args: + output_dir : Output directory. + feature_id_label : Feature identifier. + new_id_prefix : New feature identifier. + unique_ids : If True assign unique ids for the same feature type. + file_extension : Input file extension. 
+ """ + feature_types = ["exon", "transcript", "repeat", "simple_feature"] + # Initialise gene and feature counter + gene_counter = 1 + feature_counter = 1 + # Initialise dictionaries that will store the list of gene and transcript indexes + gene_id_collection = {} + gene_transcript_id_collection = {} + transcript_id_count_gene_id = {} # one gene can have multiple transcripts + input_files = output_dir.glob(str("*" + file_extension)) + with open(output_dir / "annotation.gtf", "w+") as output_file: + for input_file in input_files: + if os.stat(input_file).st_size == 0: + logger.info("File is empty, will skip %s", input_file.name) + continue + match = re.search(r"\.rs(\d+)\.re(\d+)\.", input_file.name) + assert match is not None + start_offset = int(match.group(1)) + with open(input_file, "r") as gtf_in: + for line in gtf_in: + values = line.split("\t") + if len(values) == 9 and (values[2] in feature_types) and unique_ids: + # each slice start from 0 so we need to add the offset to get the real coordinates + values[3] = str(int(values[3]) + (start_offset - 1)) + values[4] = str(int(values[4]) + (start_offset - 1)) + # Unique id based on gene and transcript ids + attribs = values[8] + # Get the slice details, gene id and transcript id. + match_gene_type = re.search( + r'(gene_id +"([^"]+)").+(transcript_id +"([^"]+)")', + line, + ) + # gene_id "1"; transcript_id "1"; biotype "tRNA_pseudogene"; + if match_gene_type: + full_gene_id_string = match_gene_type.group(1) + current_gene_id = match_gene_type.group(2) + full_transcript_id_string = match_gene_type.group(3) + current_transcript_id = match_gene_type.group(4) + # Example key KS8000.rs1.re1000000.1 + gene_id_slice = input_file.name + "." + str(current_gene_id) + # Example key KS8000.rs1.re1000000.1.transcript.1 + transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + # If there is no existing entry, the gene key will be added + # and the gene counter is incremented. + # gene_id "gene1"; transcript_id "gene1.t1" + if gene_id_slice not in gene_id_collection: + new_gene_id = f"gene{gene_counter}" + gene_id_collection[gene_id_slice] = new_gene_id + transcript_id_count_gene_id[gene_id_slice] = 1 + gene_counter += 1 + else: + # If there is a key then the gene id will be replaced with the new gene id + new_gene_id = gene_id_collection[gene_id_slice] + attribs = re.sub( + full_gene_id_string, + 'gene_id "' + new_gene_id + '"', + attribs, + ) + # If there is no existing entry, the transcript key will be added + # and the transcript counter is incremented. 
+ if transcript_id_slice not in gene_transcript_id_collection: + new_transcript_id = ( + gene_id_collection[gene_id_slice] + + ".t" + + str(transcript_id_count_gene_id[gene_id_slice]) + ) + gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + transcript_id_count_gene_id[gene_id_slice] += 1 + else: + # If a transcript of the same set is already present, + # the new id with the incremented counter is added + new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + attribs = re.sub( + full_transcript_id_string, + 'transcript_id "' + new_transcript_id + '"', + attribs, + ) + values[8] = attribs + logger.info("FINAL GTF %s", values) + output_file.write("\t".join(values)) + # Unique id based on the feature type + else: + if new_id_prefix == "repeatmask": + match_feature_type = re.search( + r"(" + feature_id_label + r"\d+)", + line, + ) + else: + match_feature_type = ( + re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, + ) + ) + if match_feature_type: + full_feature_id = match_feature_type.group(1) + new_feature_id = new_id_prefix + str(feature_counter) + attribs = re.sub( + full_feature_id, + feature_id_label + ' "' + new_feature_id + '"', + attribs, + ) + feature_counter += 1 + values[8] = attribs + output_file.write("\t".join(values)) + else: + logger.info( + "Feature type not recognised, will skip. Feature type: %s", + values[2], + ) + + +def get_sequence( + seq_region: str, + start: int, + end: int, + strand: int, + fasta_file: Path, + output_dir: Path, +) -> str: + """ + Creates a tempfile and writes the bed info to it based on whatever information + has been passed in about the sequence. Then runs bedtools getfasta. The fasta file + should have a faidx. This can be created with the create_faidx static method prior + to fetching sequence + + Arg: + seq_region: str region name + start: int region start + end: int region end + strand: int strand of the sequence + fasta_file: genome FASTA file with indexing + output_dir: str working dir + Return: str sequence + """ + start -= 1 + logger.info( + "get_sequence %s", + f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", + ) + with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") + bed_temp_file.close() + bedtools_command = [ + "bedtools", + "getfasta", + "-fi", + str(fasta_file), + "-bed", + str(bed_temp_file.name), + ] + bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with + bedtools_command, stdout=subprocess.PIPE + ) # pylint:disable=consider-using-with + for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + if idx == 1: + if strand == 1: + sequence = line.rstrip() + else: + sequence = reverse_complement(line.rstrip()) + + os.remove(bed_temp_file.name) + # logger.info(f"sequence : {sequence}") + return sequence + + +def reverse_complement(sequence: str) -> str: + """ + Get the reverse complement of a nucleotide sequence. 
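+    For example, reverse_complement("ATGC") returns "GCAT".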
+
+
+def reverse_complement(sequence: str) -> str:
+    """
+    Get the reverse complement of a nucleotide sequence.
+
+    Args:
+        sequence: The nucleotide sequence
+    Returns:
+        The reverse complement of the sequence
+    """
+    rev_matrix = str.maketrans("atgcATGC", "tacgTACG")
+    return sequence.translate(rev_matrix)[::-1]
+
+
+def check_file(file_path: Path) -> None:
+    """
+    Raise an error when the file doesn't exist
+
+    Args:
+        file_path: File path
+    """
+    if not file_path.is_file():
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
+
+
+def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]:
+    """
+    Store FASTA sequences in a dictionary keyed by header.
+
+    Args:
+        fasta_lines: List of FASTA lines (headers and sequence lines)
+    """
+    index: Dict[str, str] = {}
+    header = ""
+    seq: List[str] = []
+
+    for line in fasta_lines:
+        line = line.strip()
+        # Lines are already stripped, so match the header without a trailing newline
+        header_match = re.match(r">(.+)$", line)
+        if header_match:
+            if header:
+                index[header] = "".join(seq)
+            header = header_match.group(1)
+            seq = []
+        else:
+            seq.append(line)
+    # Add the last sequence
+    if header:
+        index[header] = "".join(seq)
+    return index
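+
+# Usage sketch for fasta_to_dict and reverse_complement above (toy records):
+#
+#   records = fasta_to_dict([">seq1", "ACGT", "TTAA", ">seq2", "GGCC"])
+#   # records == {"seq1": "ACGTTTAA", "seq2": "GGCC"}
+#   reverse_complement(records["seq1"])  # -> "TTAAACGT"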
+
+
+def split_protein_file(
+    protein_dataset: Path, output_dir: Path, batch_size: int = 20
+) -> List[Path]:
+    """
+    Split the protein dataset file into batches of batch_size sequences,
+    stored across 10 output directories (bin_0 .. bin_9).
+
+    Args:
+        protein_dataset: Path of the protein dataset.
+        output_dir: Output directory path.
+        batch_size: Size of each batch; it should equal the number of threads
+            so the sequence processing of each file can be parallelised.
+    """
+    batched_protein_files = []
+
+    for i in range(0, 10):
+        create_dir(output_dir, f"bin_{i}")
+    with open(protein_dataset, "r", encoding="utf8") as file_in:
+        seq_count = 0
+        batch_count = 0
+        current_record = ""
+        initial_seq = True
+        for line in file_in:
+            match = re.search(r">(.+)$", line)
+            # A header that is not the first sequence: once batch_size sequences
+            # have been stored, write them to a new batch file and reset current_record
+            if match and not initial_seq and seq_count % batch_size == 0:
+                # Assign bins round-robin; the random import was removed, so the
+                # bins are filled deterministically instead of via random.randint
+                bin_num = batch_count % 10
+                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+                with batch_file.open("w+") as file_out:
+                    file_out.write(current_record)
+                batch_count += 1
+                seq_count += 1
+                current_record = line
+                batched_protein_files.append(batch_file)
+            # A header that is the first sequence
+            elif match:
+                current_record += line
+                initial_seq = False
+                seq_count += 1
+            # Sequence lines
+            else:
+                current_record += line
+
+    if current_record:
+        bin_num = batch_count % 10
+        batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+        with batch_file.open("w+") as file_out:
+            file_out.write(current_record)
+        batched_protein_files.append(batch_file)
+    return batched_protein_files
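+
+# Usage sketch for split_protein_file above (hypothetical paths): splits
+# proteins.fa into 20-sequence batches spread across output/bin_0 .. output/bin_9
+# and returns the Path of every batch file written.
+#
+#   batches = split_protein_file(Path("proteins.fa"), Path("output"), batch_size=20)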
region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" +# ) +# orf_out.write(orf_seq + "\n") +# orf_counter += 1 +# orf_seq = "" +# break + +# # If there's another met in phase, then put i to the start of +# # the codon after j so that only the longest ORF is found +# if next_codon == "ATG": +# current_index = j + 3 +# orf_seq += next_codon +# current_index += 3 +# orf_out.close() + + +# def bed_to_gff(input_dir, hints_file): +# gff_out = open(hints_file, "w+") +# exons_dict = {} +# for bed_file in glob.glob(input_dir + "/*.bed"): +# logger.info("Processing file for hints:") +# logger.info(bed_file) +# bed_in = open(bed_file) +# bed_lines = bed_in.readlines() +# for line in bed_lines: +# line = line.rstrip() +# elements = line.split("\t") +# seq_region_name = elements[0] +# offset = int(elements[1]) +# hit_name = elements[3] +# strand = elements[5] +# block_sizes = elements[10].split(",") +# block_sizes = list(filter(None, block_sizes)) +# block_starts = elements[11].split(",") +# block_starts = list(filter(None, block_starts)) +# exons = bed_block_to_exons(block_sizes, block_starts, offset) +# for i, element in enumerate(exons): +# exon_coords = exons[i] +# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand +# if exon_key in exons_dict: +# exons_dict[exon_key][5] += 1 +# else: +# gff_list = [ +# seq_region_name, +# "CDNA", +# "exon", +# exon_coords[0], +# exon_coords[1], +# 1.0, +# strand, +# ".", +# ] +# exons_dict[exon_key] = gff_list + +# for exon_key, gff_list in exons_dict.items(): +# gff_list[5] = str(gff_list[5]) +# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" +# gff_out.write(gff_line) + +# gff_out.close() + +# sorted_hints_out = open((hints_file + ".srt"), "w+") +# subprocess.run( +# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], +# stdout=sorted_hints_out, +# ) +# sorted_hints_out.close() + + +# def splice_junction_to_gff(input_dir, hints_file): +# sjf_out = open(hints_file, "w+") + +# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): +# sjf_in = open(sj_tab_file) +# sjf_lines = sjf_in.readlines() +# for line in sjf_lines: +# elements = line.split("\t") +# strand = "+" +# # If the strand is undefined then skip, Augustus expects a strand +# if elements[3] == "0": +# continue +# elif elements[3] == "2": +# strand = "-" + +# junction_length = int(elements[2]) - int(elements[1]) + 1 +# if junction_length < 100: +# continue + +# if not elements[4] and elements[7] < 10: +# continue + +# # For the moment treat multimapping and single mapping things as a combined score +# score = float(elements[6]) + float(elements[7]) +# score = str(score) +# output_line = [ +# elements[0], +# "RNASEQ", +# "intron", +# elements[1], +# elements[2], +# score, +# strand, +# ".", +# ("src=W;mul=" + score + ";"), +# ] +# sjf_out.write("\t".join(output_line) + "\n") + +# sjf_out.close() + + +# def split_genome(genome_file, target_dir, min_seq_length): +# # This is the lazy initial way of just splitting into a dir +# # of files based on the toplevel sequence with a min sequence length filter +# # There are a couple of obvious improvements: +# # 1) Instead of making files for all seqs, just process N seqs +# # parallel, where N = num_threads. Then you could clean up the seq file +# # after each seq finishes, thus avoiding potentially +# # having thousands of file in a dir +# # 2) Split the seq into even slices and process these in parallel +# # (which the same cleanup as in 1). 
+
+
+def check_transcriptomic_output(main_output_dir: Path) -> None:
+    """Check across the various transcriptomic outputs that there is a reasonable amount of data
+
+    Args:
+        main_output_dir: Output dir path
+    Raises:
+        IOError: If no lines, or fewer than the minimum expected lines, are found in the output files.
+    """
+    transcriptomic_dirs = [
+        "scallop_output",
+        "stringtie_output",
+        "minimap2_output",
+    ]
+    total_lines = 0
+    min_lines = 100000
+    for transcriptomic_dir in transcriptomic_dirs:
+        full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf"
+        if not full_file_path.exists():
+            logger.warning(
+                "Warning, no annotation.gtf found for %s. This might be fine, "
+                "e.g. no long read data were provided",
+                transcriptomic_dir,
+            )
+            continue
+        num_lines = sum(1 for _ in full_file_path.open("r"))
+        total_lines += num_lines
+        logger.info(
+            "For %s found a total of %s lines in the annotation.gtf file",
+            transcriptomic_dir,
+            num_lines,
+        )
+    if total_lines == 0:
+        raise IOError(
+            "Anno was run with transcriptomic mode enabled, "
+            "but the transcriptomic annotation files are empty"
+        )
+    if total_lines <= min_lines:
+        raise IOError(
+            "Anno was run with transcriptomic mode enabled, "
+            "but the total number of lines in the output files "
+            "was less than the minimum expected value.\n"
+            "Found: " + str(total_lines) + "\n"
+            "Min allowed: " + str(min_lines)
+        )
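+
+# Usage sketch for check_transcriptomic_output above: raises IOError when fewer
+# than 100,000 GTF lines are found in total across scallop_output/,
+# stringtie_output/ and minimap2_output/.
+#
+#   check_transcriptomic_output(Path("main_output"))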
+
+
+# def run_find_orfs(genome_file, main_output_dir):
+#     min_orf_length = 600
+
+#     orf_output_dir = create_dir(main_output_dir, "orf_output")
+#     seq_region_lengths = get_seq_region_lengths(genome_file, 5000)
+#     for region_name in seq_region_lengths:
+#         region_length = seq_region_lengths[region_name]
+#         seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir)
+#         for phase in range(0, 6):
+#             find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir)
+
+
+# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir):
+#     current_index = phase
+#     orf_counter = 1
+#     if phase > 2:
+#         seq = reverse_complement(seq)
+#         current_index = current_index % 3
+
+#     orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa"))
+#     orf_out = open(orf_file_path, "w+")
+
+#     while current_index < len(seq):
+#         codon = seq[current_index : current_index + 3]
+#         if codon == "ATG":
+#             orf_seq = codon
+#             for j in range(current_index + 3, len(seq), 3):
+#                 next_codon = seq[j : j + 3]
+#                 if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA":
+#                     orf_seq += next_codon
+#                     if len(orf_seq) >= min_orf_length:
+#                         orf_out.write(
+#                             ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n"
+#                         )
+#                         orf_out.write(orf_seq + "\n")
+#                         orf_counter += 1
+#                     orf_seq = ""
+#                     break
+
+#                 # If there's another met in phase, then put i to the start of
+#                 # the codon after j so that only the longest ORF is found
+#                 if next_codon == "ATG":
+#                     current_index = j + 3
+#                 orf_seq += next_codon
+#         current_index += 3
+#     orf_out.close()
+
+
+# def bed_to_gff(input_dir, hints_file):
+#     gff_out = open(hints_file, "w+")
+#     exons_dict = {}
+#     for bed_file in glob.glob(input_dir + "/*.bed"):
+#         logger.info("Processing file for hints:")
+#         logger.info(bed_file)
+#         bed_in = open(bed_file)
+#         bed_lines = bed_in.readlines()
+#         for line in bed_lines:
+#             line = line.rstrip()
+#             elements = line.split("\t")
+#             seq_region_name = elements[0]
+#             offset = int(elements[1])
+#             hit_name = elements[3]
+#             strand = elements[5]
+#             block_sizes = elements[10].split(",")
+#             block_sizes = list(filter(None, block_sizes))
+#             block_starts = elements[11].split(",")
+#             block_starts = list(filter(None, block_starts))
+#             exons = bed_block_to_exons(block_sizes, block_starts, offset)
+#             for i, element in enumerate(exons):
+#                 exon_coords = exons[i]
+#                 exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand
+#                 if exon_key in exons_dict:
+#                     exons_dict[exon_key][5] += 1
+#                 else:
+#                     gff_list = [
+#                         seq_region_name,
+#                         "CDNA",
+#                         "exon",
+#                         exon_coords[0],
+#                         exon_coords[1],
+#                         1.0,
+#                         strand,
+#                         ".",
+#                     ]
+#                     exons_dict[exon_key] = gff_list
+
+#     for exon_key, gff_list in exons_dict.items():
+#         gff_list[5] = str(gff_list[5])
+#         gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n"
+#         gff_out.write(gff_line)
+
+#     gff_out.close()
+
+#     sorted_hints_out = open((hints_file + ".srt"), "w+")
+#     subprocess.run(
+#         ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file],
+#         stdout=sorted_hints_out,
+#     )
+#     sorted_hints_out.close()
+
+
+# def splice_junction_to_gff(input_dir, hints_file):
+#     sjf_out = open(hints_file, "w+")
+
+#     for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"):
+#         sjf_in = open(sj_tab_file)
+#         sjf_lines = sjf_in.readlines()
+#         for line in sjf_lines:
+#             elements = line.split("\t")
+#             strand = "+"
+#             # If the strand is undefined then skip, Augustus expects a strand
+#             if elements[3] == "0":
+#                 continue
+#             elif elements[3] == "2":
+#                 strand = "-"
+
+#             junction_length = int(elements[2]) - int(elements[1]) + 1
+#             if junction_length < 100:
+#                 continue
+
+#             if not elements[4] and elements[7] < 10:
+#                 continue
+
+#             # For the moment treat multimapping and single mapping things as a combined score
+#             score = float(elements[6]) + float(elements[7])
+#             score = str(score)
+#             output_line = [
+#                 elements[0],
+#                 "RNASEQ",
+#                 "intron",
+#                 elements[1],
+#                 elements[2],
+#                 score,
+#                 strand,
+#                 ".",
+#                 ("src=W;mul=" + score + ";"),
+#             ]
+#             sjf_out.write("\t".join(output_line) + "\n")
+
+#     sjf_out.close()
+
+
+# def split_genome(genome_file, target_dir, min_seq_length):
+#     # This is the lazy initial way of just splitting into a dir
+#     # of files based on the toplevel sequence with a min sequence length filter
+#     # There are a couple of obvious improvements:
+#     # 1) Instead of making files for all seqs, just process N seqs
+#     #    in parallel, where N = num_threads. Then you could clean up the seq file
+#     #    after each seq finishes, thus avoiding potentially
+#     #    having thousands of files in a dir
+#     # 2) Split the seq into even slices and process these in parallel
+#     #    (with the same cleanup as in 1). For sequences smaller than the
+#     #    target slice size, bundle them up together into a single file. Vastly
+#     #    more complex, partially implemented in the slice_genome method.
+#     #    Allows for more consistency with parallelisation (since there should
+#     #    be no large outliers). But requires a mapping strategy for the
+#     #    coords and sequence names, and all the hints will need to be adjusted
+#     current_header = ""
+#     current_seq = ""
+
+#     file_in = open(genome_file)
+#     line = file_in.readline()
+#     while line:
+#         match = re.search(r">(.+)$", line)
+#         if match and current_header:
+#             if len(current_seq) > min_seq_length:
+#                 file_out_name = os.path.join(target_dir, (current_header + ".split.fa"))
+#                 if not os.path.exists(file_out_name):
+#                     file_out = open(file_out_name, "w+")
+#                     file_out.write(">" + current_header + "\n" + current_seq + "\n")
+#                     file_out.close()
+#                 else:
+#                     print("Found an existing split file, so will not overwrite. File found:")
+#                     print(file_out_name)
+
+#             current_seq = ""
+#             current_header = match.group(1)
+#         elif match:
+#             current_header = match.group(1)
+#         else:
+#             current_seq += line.rstrip()
+
+#         line = file_in.readline()
+
+#     if len(current_seq) > min_seq_length:
+#         file_out_name = os.path.join(target_dir, (current_header + ".split.fa"))
+#         if not os.path.exists(file_out_name):
+#             file_out = open(file_out_name, "w+")
+#             file_out.write(">" + current_header + "\n" + current_seq + "\n")
+#             file_out.close()
+#         else:
+#             logger.info("Found an existing split file, so will not overwrite. File found:")
+#             logger.info(file_out_name)
+
+#     file_in.close()
+
+
+# # def merge_gtf_files(file_paths, id_label):
+# #     gtf_file_path = os.path.join(output_dir, 'annotation.gtf')
+# #     gtf_out = open(gtf_file_path, 'w+')
+# #     for gtf_file_path in gtf_files:
+# #         gtf_file_name = os.path.basename(gtf_file_path)
+# #         match = re.search(r'\.rs(\d+)\.re(\d+)\.', gtf_file_name)
+# #         start_offset = int(match.group(1))
+# #         gtf_in = open(gtf_file_path, 'r')
+# #         line = gtf_in.readline()
+# #         while line:
+# #             values = line.split("\t")
+# #             if len(values) == 9 and (values[2] in feature_types):
+# #                 values[3] = str(int(values[3]) + (start_offset - 1))
+# #                 values[4] = str(int(values[4]) + (start_offset - 1))
+# #                 gtf_out.write("\t".join(values))
+# #             line = gtf_in.readline()
+# #         gtf_in.close()
+# #     gtf_out.close()
+
+
+# def multiprocess_generic(cmd):
+#     print(" ".join(cmd))
+#     subprocess.run(cmd)
+
+
+# def seq_region_names(genome_file):
+#     region_list = []
+
+#     file_in = open(genome_file)
+#     line = file_in.readline()
+#     while line:
+#         match = re.search(r">([^\s]+)", line)
+#         if match:
+#             region_name = match.group(1)
+#             if region_name == "MT":
+#                 logger.info("Skipping region named MT")
+#                 line = file_in.readline()
+#                 continue
+#             else:
+#                 region_list.append(match.group(1))
+#         line = file_in.readline()
+
+#     return region_list
+
+
+# def slice_genome(genome_file, target_dir, target_slice_size):
+#     # The below is sort of tested
+#     # Without the
+#     target_seq_length = 50000000
+#     min_seq_length = 1000
+#     current_header = ""
+#     current_seq = ""
+#     seq_dict = {}
+#     for line in seq:  # pylint:disable=used-before-assignment
+#         match = re.search(r">(.+)$", line)
+#         if match and current_header:
+#             seq_dict[current_header] = current_seq
+#             current_seq = ""
+#             current_header = match.group(1)
+#         elif match:
+#             current_header = match.group(1)
+#         else:
+#             current_seq += line.rstrip()
+
+#     seq_dict[current_header] = current_seq
+
+#     seq_buffer = 0
+#     file_number = 0
+#     file_name = "genome_file_" + str(file_number)
+
+#     for header in seq_dict:
+#         seq_iterator = 0
+#         seq = seq_dict[header]
+
+#         while len(seq) > target_seq_length:
+#             file_out = open(os.path.join(target_dir, file_name), "w+")
+#             subseq = seq[0:target_seq_length]
+#             file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n")
+#             file_out.close()
+#             seq = seq[target_seq_length:]
+#             seq_iterator += 1
+#             file_number += 1
+#             file_name = "genome_file_" + str(file_number)
+
+#         if len(seq) >= min_seq_length:
+#             file_name = "genome_file_" + str(file_number)
+#             file_out = open(os.path.join(file_name), "w+")
+#             file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n")
+#             file_out.close()
+#             file_number += 1
+#             file_name = "genome_file_" + str(file_number)
+
+
+# def coallate_results(main_output_dir):
+#     results_dir = create_dir(main_output_dir, "results")
+#     output_dirs = [
+#         "augustus_output",
+#         "cpg_output",
+#         "dust_output",
+#         "eponine_output",
+#         "red_output",
+#         "rfam_output",
+#         "trf_output",
+#         "trnascan_output",
+#     ]
+#     for output_dir in output_dirs:
+#         match = re.search(r"(.+)_output", output_dir)
+#         result_type = match.group(1)
+#         results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf")
+#         copy_path = os.path.join(results_dir, (result_type + ".gtf"))
+#         if os.path.exists(results_path):
+#             cpy_cmd = ["cp", results_path, copy_path]
+#             subprocess.run(cpy_cmd)