Odonata #111

Merged
merged 25 commits on May 3, 2024

Changes from all commits
25 commits
7873c92
reconfigured for Odonata
rvosa Apr 5, 2024
8ab5338
Merge pull request #91 from naturalis/main
rvosa Apr 16, 2024
6aa39c7
generalized skip patterns for BOLD and OpenToL data packages
Apr 16, 2024
405ca70
updated and commented config values
Apr 16, 2024
c675e80
adding placeholder for subtree data
Apr 16, 2024
b65bc0d
adding IF NOT EXISTS in index creation
Apr 17, 2024
a992357
sqlite3 is absent on MaaS, needs to be in conda env
Apr 17, 2024
9214449
adding sqlite requirement
Apr 17, 2024
afb6093
Merge pull request #92 from naturalis/main
rvosa Apr 17, 2024
570bd0b
now points to curated subset
Apr 18, 2024
6ebf108
Merge pull request #94 from naturalis/main
rvosa Apr 18, 2024
691593e
adding placeholder
Apr 18, 2024
a0871c0
only get named families (not NULL) and named species
Apr 18, 2024
55f97ea
should have 36 families?
Apr 18, 2024
cdfaaf0
new queries omit families with zero records for marker_code
Apr 18, 2024
f2be0fe
now does integral alignment, which works to make the alignment flush …
Apr 19, 2024
45daa34
added nfamilies comment
Apr 19, 2024
e3d9b15
rule needs hmmer for realigning backbone seqs
Apr 19, 2024
4b5c820
needs seqmagick for format conversions
Apr 19, 2024
b7c916a
realigns exemplars for backbone
Apr 19, 2024
7fcc676
defensive coding for 0-length branches
Apr 19, 2024
32e0f66
now emits a list of putatively extinct PIDs
Apr 21, 2024
69489ab
putatively extinct PIDs are now removed from the backbone alignment
Apr 21, 2024
267b22f
skips over extinct PIDs
Apr 21, 2024
f3b23e0
entire odonate pipeline runs
Apr 21, 2024
5 changes: 2 additions & 3 deletions .gitignore
@@ -12,14 +12,13 @@ RAxML_*
.DS_Store

# ignore local results
resources/BOLD_Public.18-Dec-2023
resources/BOLD_Public.30-Dec-2022.tar.gz
resources/BOLD_Public.*
results/databases/
results/fasta/
results/blast/
results/grafted.tre
resources/mnt/
resources/opentree/
resources/opentree*
*.sto

# ignore snakemake hidden files
24 changes: 16 additions & 8 deletions config/config.yaml
@@ -1,9 +1,17 @@

# Used for parallelization by snakemake
cpu_cores: 8

# Used for scatter/gather processing, corresponds with the number of families within the taxon defined in fasta_filter
nfamilies: 17
# Used for parallelization by snakemake. This setting is based on the architecture of the metal-as-a-service
# node netdc-bms-c11g.maas, which has 56 cores. Adjust this setting as needed. For example, on most laptops
# a reasonable value might be 4 or 8.
cpu_cores: 56

# Used for scatter/gather processing, corresponds with the number of families within the taxon defined in
# fasta_filter that have records for the specified marker. In practice, this means that the input BCDM TSV
# file has to have exactly this many distinct (non-empty) values for the `family` column where the column
# named under fasta_filter.filter_level has the value fasta_filter.filter_name (e.g. the `order` must be
# `Odonata`). TODO: make this so that it is done automatically from the data. At present, this needs to be
# calculated by the user from the input file, e.g. by first making the sqlite database, or by grepping the
# TSV file somehow.
nfamilies: 36
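Until that TODO is addressed, the value can be derived with a one-off query. A minimal sketch in Python, assuming the SQLite database produced by workflow/scripts/create_database.py, the COI-5P marker, and a database path matching the Snakefile's params (adjust these to your configuration):

```python
# Count the distinct, non-empty families for the configured filter; the result
# is what nfamilies in config/config.yaml should be set to. The database path
# and marker code below are assumptions.
import sqlite3

conn = sqlite3.connect('results/databases/BOLD_COI-5P_barcodes.db')
count, = conn.execute('''
    SELECT COUNT(DISTINCT family)
    FROM barcode
    WHERE marker_code = 'COI-5P'
      AND "order" = 'Odonata'
      AND family IS NOT NULL
      AND family <> ''
''').fetchone()
conn.close()
print(count)  # e.g. 36 for this Odonata configuration
```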

# Number of outgroups to include in each family-level analysis. Minimum is 2.
outgroups: 3
@@ -39,7 +47,7 @@ datatype: NT
# filter levels: class, order, family, genus, all (no filter)
fasta_filter:
filter_level: order
filter_name: Primates
filter_name: Odonata

name: phylogeny

@@ -50,8 +58,8 @@ dependencies:
- -r requirements.txt

file_names:
bold_tsv: resources/BOLD_Public.18-Dec-2023/BOLD_Public.18-Dec-2023.tsv
open_tre: resources/opentree/opentree13.4_tree/labelled_supertree/labelled_supertree.tre
bold_tsv: resources/BOLD_Public.05-Apr-2024-curated.tsv
open_tre: resources/opentree14.9_tree/labelled_supertree/labelled_supertree.tre
hmm: resources/hmm/COI-5P.hmm
fasta_dir: results/fasta/family
blast_dir: results/blast
1 change: 1 addition & 0 deletions results/databases/README.md
@@ -0,0 +1 @@
Placeholder. This folder will contain SQLite databases.
1 change: 1 addition & 0 deletions results/fasta/family/README.md
@@ -0,0 +1 @@
This is a placeholder for a folder structure where FASTA files, alignments, and trees are written as intermediate results.
48 changes: 41 additions & 7 deletions workflow/Snakefile
@@ -284,7 +284,12 @@ rule reroot_raxml_output:
-v {params.log_level} 2> {log}
"""

# TODO: experiment with exemplar selection strategies
# In principle, exemplar choosing can be performed in one of three ways:
# 1. the tallest tips on either side of the root are chosen
# 2. the shallowest tips on either side
# 3. the ones closest to the median root-to-tip path length
# Empirically, the first option yields rescaled branch lengths that are closest
# to those optimized freely on a total tree. A sketch of strategy 1 follows the
# rule below.
rule choose_exemplars:
input:
alignment = "results/fasta/family/{scatteritem}/aligned.fa",
@@ -305,30 +310,53 @@ rule choose_exemplars:
-o {output} 2> {log}
"""


# When aligning the family-level subsets, different families may have different indel
# patterns. So, although they are all oriented against the model in the same way,
# there can be some frame shifting. The least bad option is to unalign and realign
# the exemplar sequences on the fly. In the shell block below, sed does the unaligning,
# hmmalign emits Stockholm data, and seqmagick converts it to non-interleaved FASTA.
rule prep_raxml_backbone:
input:
fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa"),
opentol='results/databases/megatree_loader.ok'
output:
fasta="results/fasta/raxml-ready.fa",
tree="results/fasta/raxml-ready.tre"
tree="results/fasta/raxml-ready.tre",
extinct="results/fasta/extinct_pids.txt"
params:
db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
log_level = config['log_level']
log_level = config['log_level'],
hmm_file=config['file_names']['hmm']
log: "logs/prep_raxml_backbone/prep_raxml_backbone.log"
conda: "envs/prep_raxml_backbone.yml"
shell:
"""
cat {input.fastas} > {output.fasta}
# Generate constraint tree and list of extinct PIDs
python workflow/scripts/backbone_constraint.py \
-d {params.db} \
-v {params.log_level} \
-i '{input.fastas}' \
-e {output.extinct} \
-o {output.tree} 2> {log}

# Clean the concatenated FASTA by removing gaps (dashes)
sed '/^>/! s/-//g' {input.fastas} > results/fasta/unaligned.fa

# Align with hmmalign and output in Stockholm format
hmmalign --trim --dna --informat FASTA --outformat Stockholm {params.hmm_file} results/fasta/unaligned.fa > results/fasta/aligned.sto

# Convert the Stockholm alignment to a non-interleaved FASTA format for RAxML
seqmagick convert results/fasta/aligned.sto {output.fasta}

# Remove any extinct PIDs
[ -e {output.extinct} ] && seqmagick mogrify --exclude-from-file {output.extinct} {output.fasta}
"""


# Constructs the backbone topology under ML using raxml-ng with a tree constraint that
# is intended to keep all pairs of exemplars monophyletic. Here, no outgroups are
# specified, because in principle this step could be dealing with the entire
# taxonomic width of BOLD. Instead, the tree will be rooted using the constraint.
rule run_raxml_backbone:
input:
alignment = "results/fasta/raxml-ready.fa",
@@ -349,7 +377,11 @@ rule run_raxml_backbone:
--search > {log} 2>&1
"""


# Reroots the backbone using the constraint tree. The basic method is to find the
# basal split in the constraint, look for the equivalent bipartition in the inferred
# tree, and root there (a sketch follows the rule below). TODO: this approach
# currently places the root on the midpoint of the bipartition. This is not ideal
# but it is not obvious what better options are available if there are no outgroups.
rule reroot_backbone:
input:
backbone = "results/fasta/raxml-ready.fa.raxml.bestTree",
@@ -373,7 +405,8 @@ rule reroot_backbone:
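A sketch of the rerooting idea described above, assuming dendropy and hypothetical file names (the actual logic lives in the script this rule invokes):

```python
# Root the inferred tree on the edge corresponding to the constraint tree's
# basal split, placing the root on that edge's midpoint. Assumes the matching
# bipartition exists in the inferred tree and is not its current root edge.
import dendropy

tns = dendropy.TaxonNamespace()
constraint = dendropy.Tree.get(path='constraint.tre', schema='newick', taxon_namespace=tns)
inferred = dendropy.Tree.get(path='inferred.tre', schema='newick', taxon_namespace=tns)

# Taxa on one side of the constraint's basal split
side = [leaf.taxon for leaf in constraint.seed_node.child_nodes()[0].leaf_iter()]

# The equivalent bipartition in the inferred tree subtends the MRCA of those
# taxa; reroot on the midpoint of the edge above that node.
edge = inferred.mrca(taxa=side).edge
inferred.reroot_at_edge(edge, length1=edge.length / 2, length2=edge.length / 2)
inferred.write(path='rooted.tre', schema='newick')
```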
rule graft_clades:
input:
backbone = "results/fasta/raxml-ready.fa.raxml.bestTree.rooted",
clades = config['file_names']['fasta_dir']
clades = config['file_names']['fasta_dir'],
extinct = "results/fasta/extinct_pids.txt"
output:
tree = "results/grafted.tre"
params:
@@ -386,6 +419,7 @@
python workflow/scripts/graft_clades.py \
-t {input.backbone} \
-f {input.clades} \
-e {input.extinct} \
-o {output.tree} \
-n {params.nfamilies} \
-v {params.log_level} 2> {log}
1 change: 1 addition & 0 deletions workflow/envs/create_database.yml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- sqlite
- pip
- pip:
- -r ../../workflow/envs/create_database.txt
1 change: 1 addition & 0 deletions workflow/envs/map_opentol.yml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- sqlite
- pip
- pip:
- -r ../../workflow/envs/map_opentol.txt
3 changes: 2 additions & 1 deletion workflow/envs/megatree_loader.yml
@@ -4,4 +4,5 @@ channels:
- bioconda
- defaults
dependencies:
- perl-bio-phylo-forest-dbtree
- sqlite
- perl-bio-phylo-forest-dbtree
2 changes: 2 additions & 0 deletions workflow/envs/prep_raxml_backbone.yml
@@ -5,6 +5,8 @@ channels:
- defaults
dependencies:
- sqlite
- hmmer
- seqmagick
- pip
- pip:
- -r ../../workflow/envs/prep_raxml_backbone.txt
28 changes: 22 additions & 6 deletions workflow/scripts/backbone_constraint.py
@@ -10,9 +10,12 @@

def get_ott_ids(pids, conn):
"""
Given a list of BOLD process IDs, verifies that these all belong to the same family
and if so, maps them to OTT IDs. The OTT IDs come from the synthetic tree (i.e. not
'broken' taxa).
:param pids: a list of BOLD process IDs
:param conn: a sqlite3 database handle
:return:
:return: a dictionary where key is OTT ID, value is list of process IDs
"""
pids_for_ott = {}
family = None
@@ -57,24 +60,31 @@ def process_exemplars(exemplar_files, conn):
logger.info('Going to process input files')
logger.debug(exemplar_files)
pids_for_ott = {}
extinct_pids = []
for exemplar_file in exemplar_files:
if not os.path.isfile(exemplar_file):
logger.info(f'Location "{exemplar_file}" is not a file - skipping...')
continue

# Ideally, these have OTT IDs that we can use
# Read process IDs from exemplar file into a list
logger.info(f'Processing input file {exemplar_file}')
process_ids = []
for record in SeqIO.parse(exemplar_file, 'fasta'):
process_ids.append(record.id)

# Check the database
# Get an OTT ID for the process IDs in the list
dict_of_lists = get_ott_ids(process_ids, conn)
if dict_of_lists:
pids_for_ott.update(dict_of_lists)
else:

# The process IDs have no family that occurs in the OpenTree. In two cases so far, this
# has occurred because the focal exemplar file consisted entirely of extinct taxa
# (i.e. an entire extinct family, such as Megaladapidae). The least bad thing to do
# with these is to remove them from the backbone.
logger.warning(f'Input file with unconstrained sequences (maybe extinct?): {exemplar_file}')
return pids_for_ott
extinct_pids.extend(process_ids)
return pids_for_ott, extinct_pids


def remap_tips(tree, pidmap):
@@ -114,6 +124,7 @@ def remap_tips(tree, pidmap):
parser.add_argument('-d', '--database', required=True, help='SQLite database file')
parser.add_argument('-i', '--inaln', required=True, help='Input exemplar FASTA files')
parser.add_argument('-o', '--outtree', required=True, help="Output constraint tree")
parser.add_argument('-e', '--extinctpids', required=True, help='Putatively extinct PIDs')
parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)')
args = parser.parse_args()

@@ -124,12 +135,17 @@ def remap_tips(tree, pidmap):
logger.info(f"Going to connect to database {args.database}")
connection = sqlite3.connect(args.database)

# Get one-to-many mapping from OTT IDs to process IDs
pidmap = process_exemplars(args.inaln.split(' '), connection)
# Get one-to-many mapping from OTT IDs to process IDs and store extinct PIDs
pidmap, extinctpids = process_exemplars(args.inaln.split(' '), connection)
connection.close()
if len(extinctpids) != 0:
with open(args.extinctpids, 'w') as file:
for pid in extinctpids:
file.write(f"{pid}\n")

# Fetch opentol subtree, remap tips
ids = [int(str(item).removeprefix('ott')) for item in list(pidmap.keys())]
logger.info(f'Getting subtree for tips {ids}')
ott_tree = opentol.get_subtree(ids)
remap_tips(ott_tree, pidmap)

12 changes: 6 additions & 6 deletions workflow/scripts/create_database.py
@@ -117,7 +117,7 @@ def index_taxonomy(table):
# Iterate over taxonomy columns and compute index
for column in ['kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'species', 'bin_uri']:
logger.info(f'Going to index {column} column of {table} table')
database_cursor.execute(f'CREATE INDEX {table}_{column}_idx ON {table} ("{column}");')
database_cursor.execute(f'CREATE INDEX IF NOT EXISTS {table}_{column}_idx ON {table} ("{column}");')


def normalize_taxonomy():
@@ -187,12 +187,12 @@ def update_fk():

# Load TSV into temporary barcode table, unindexed, then copy over
load_tsv(args.intsv, args.outdb, 'barcode_temp', 'barcode')
database_cursor.execute('CREATE INDEX barcode_nuc_idx ON barcode(nuc)')
database_cursor.execute('CREATE INDEX barcode_processid_idx ON barcode(processid)')
database_cursor.execute('CREATE INDEX barcode_marker_code_idx ON barcode(marker_code)')
database_cursor.execute('CREATE INDEX IF NOT EXISTS barcode_nuc_idx ON barcode(nuc)')
database_cursor.execute('CREATE INDEX IF NOT EXISTS barcode_processid_idx ON barcode(processid)')
database_cursor.execute('CREATE INDEX IF NOT EXISTS barcode_marker_code_idx ON barcode(marker_code)')

database_cursor.execute('CREATE INDEX barcode_processid_idx ON barcode(processid)')
database_cursor.execute('CREATE INDEX barcode_marker_code_idx ON barcode(marker_code)')
database_cursor.execute('CREATE INDEX IF NOT EXISTS barcode_processid_idx ON barcode(processid)')
database_cursor.execute('CREATE INDEX IF NOT EXISTS barcode_marker_code_idx ON barcode(marker_code)')

# Index the barcode table's taxonomy, copy its distinct tuples into the taxon table, then index the latter
index_taxonomy('barcode')
24 changes: 15 additions & 9 deletions workflow/scripts/family_fasta.py
@@ -20,15 +20,19 @@ def get_family_bins(q, conn):
# Select all distinct family names that match config.yaml filters
level = q['level']
name = q['name']
marker_code = q['marker_code']
sql = f'''
SELECT family, bin_uri, species
FROM taxon
WHERE "{level}" = "{name}" AND species is not "None"
GROUP BY family, bin_uri
SELECT DISTINCT family, bin_uri
FROM barcode
WHERE marker_code = '{marker_code}'
AND "{level}" = '{name}'
AND family IS NOT NULL
AND family <> ''
ORDER BY family ASC, bin_uri ASC;
'''
fam = pd.read_sql_query(sql, conn)

# Check if filter is all or if used filter did not resulted in any records
# Check if this configuration contains any records at all
if len(fam) == 0:
raise Exception(f"No records found with {q}.")

@@ -56,9 +60,11 @@ def write_bin(q, conn, fh):
t."{q["level"]}" = "{q["name"]}" AND
t.family = "{q["family"]}" AND
t.bin_uri = "{q["bin_uri"]}" AND
b.marker_code = "{q["marker_code"]}"
b.marker_code = "{q["marker_code"]}" AND
t.species IS NOT NULL AND
t.species <> ''
ORDER BY
length(b.nuc) DESC LIMIT 1
b.nuc_basecount DESC LIMIT 1
'''
logger.debug(query)
famseq = pd.read_sql_query(query, conn)
@@ -97,7 +103,8 @@ def write_bin(q, conn, fh):
# Get families and bins for configured level and name
query = {
'level': args.level,
'name': args.name
'name': args.name,
'marker_code': args.marker
}
df = get_family_bins(query, connection)

@@ -121,7 +128,6 @@ def write_bin(q, conn, fh):
logger.debug(f"Writing {bin_uri}")
query['bin_uri'] = bin_uri
query['family'] = family
query['marker_code'] = args.marker
write_bin(query, connection, handle)

index += 1