Hide BiopythonDeprecationWarnings when reading certain sequence files

Biopython 1.85 emits a BiopythonDeprecationWarning when format='fasta' is used to read files that start with anything other than '>'. That warning should not be exposed as-is to Augur users. It is not triggered when reading files with format='fasta-pearson', so switching to that format on Biopython >=1.85 is the easiest way to maintain the behavior of Biopython <1.85.
nextstrain · Jan 21, 2025 · 945011a · 945011a
1 parent cc365cb
commit 945011a
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 6 deletions.
diff --git a/augur/align.py b/augur/align.py
@@ -8,7 +8,7 @@
 from Bio import AlignIO, SeqIO, Seq, Align
 from .argparse_ import ExtendOverwriteDefault
 from .io.file import open_file
-from .io.sequences import read_sequence, read_sequences as io_read_sequences
+from .io.sequences import read_sequence, read_sequences as io_read_sequences, BIOPYTHON_FASTA_FORMAT
 from .io.shell_command_runner import run_shell_command
 from .io.vcf import shquote
 from .utils import nthreads_value
@@ -241,7 +241,7 @@ def read_reference(ref_fname):
         raise AlignmentError("ERROR: Cannot read reference sequence."
                              "\n\tmake sure the file \"%s\" exists"%ref_fname)
     try:
-        ref_seq = read_sequence(ref_fname, format='genbank' if ref_fname.split('.')[-1] in ['gb', 'genbank'] else 'fasta')
+        ref_seq = read_sequence(ref_fname, format='genbank' if ref_fname.split('.')[-1] in ['gb', 'genbank'] else BIOPYTHON_FASTA_FORMAT)
     except:
         raise AlignmentError("ERROR: Cannot read reference sequence."
                 "\n\tmake sure the file %s contains one sequence in genbank or fasta format"%ref_fname)

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -32,7 +32,7 @@
 from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
     genome_features_to_auspice_annotation
 from .io.file import open_file
-from .io.sequences import read_sequence
+from .io.sequences import read_sequence, BIOPYTHON_FASTA_FORMAT
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
@@ -399,7 +399,7 @@ def run(args):
         aln = args.alignment
         ref = None
         if args.root_sequence:
-            for fmt in ['fasta', 'genbank']:
+            for fmt in [BIOPYTHON_FASTA_FORMAT, 'genbank']:
                 try:
                     ref = str(read_sequence(args.root_sequence, format=fmt).seq).upper()
                     break

diff --git a/augur/io/sequences.py b/augur/io/sequences.py
@@ -2,13 +2,18 @@
 import os
 
 from augur.errors import AugurError
+from importlib.metadata import version
+from packaging.version import Version
 from typing import Iterator, Iterable, Union
 from .file import open_file
 
 
+BIOPYTHON_FASTA_FORMAT = "fasta" if Version(version("biopython")) < Version("1.85") else "fasta-pearson"
+
+
 def read_sequence(
     path: str,
-    format: str = "fasta",
+    format: str = BIOPYTHON_FASTA_FORMAT,
 ) -> Bio.SeqIO.SeqRecord:
     """Read a single sequence from a path.
 
@@ -20,7 +25,7 @@ def read_sequence(
 
 def read_sequences(
     *paths: Iterable[Union[str, os.PathLike]],
-    format: str = "fasta",
+    format: str = BIOPYTHON_FASTA_FORMAT,
 ) -> Iterator[Bio.SeqIO.SeqRecord]:
     """Read sequences from one or more paths.