Merge pull request #1381: Enforce UTF-8 encoding

nextstrain · Feb 12, 2024 · e2ca468
2 parents dd8a1cb + 5762baa
commit e2ca468
Show file tree

Hide file tree

Showing 24 changed files with 122 additions and 44 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,6 +9,7 @@
 * `augur parse`: A new optional `--output-id-field` argument allows the user to select any ID field for the produced FASTA file (e.g. 'accession' instead of 'name' or 'strain'). [#1403][] (@j23414)
   * When no `--output-id-field` is given and the data has both `name` and `strain` fields, continue to preferentially use `name` over `strain` as the sequence ID field; but, throw a deprecation warning that the order will be switched to prefer `strain` over `name` in the future to be consistent with the rest of Augur.
   * Added entry to [DEPRECATED.md](./DEPRECATED.md).
+* Compression should now be supported for all input and output files. Please [open an issue](https://github.com/nextstrain/augur/issues) if you find a file for which compression is not supported! [#1381][] (@victorlin)
 
 ### Bug Fixes
 
@@ -19,8 +20,10 @@
 * filter: Updated the help text of `--include` and `--include-where` to explicitly state that this can add strains that are missing an entry from `--sequences`. [#1389][] (@victorlin)
 * filter: Fixed the summary messages to properly reflect force-inclusion of strains that are missing an entry from `--sequences`. [#1389][] (@victorlin)
 * filter: Updated wording of summary messages. [#1389][] (@victorlin)
+* Enforce UTF-8 encoding when reading and writing files. Improve error messages when a non-UTF-8 file is used. [#1381][] (@victorlin)
 
 [#1294]: https://github.com/nextstrain/augur/pull/1294
+[#1381]: https://github.com/nextstrain/augur/pull/1381
 [#1389]: https://github.com/nextstrain/augur/pull/1389
 [#1410]: https://github.com/nextstrain/augur/pull/1410
 [#1403]: https://github.com/nextstrain/augur/pull/1403

diff --git a/augur/align.py b/augur/align.py
@@ -6,6 +6,7 @@
 from shutil import copyfile
 import numpy as np
 from Bio import AlignIO, SeqIO, Seq, Align
+from .io.file import open_file
 from .io.shell_command_runner import run_shell_command
 from .io.vcf import shquote
 from .utils import nthreads_value
@@ -369,7 +370,7 @@ def analyse_insertions(aln, ungapped, insertion_csv):
         for insertion_seq, strains in i_data.items():
             for strain in strains:
                 strain_data[strain][idx] = insertion_seq
-    with open(insertion_csv, 'w', encoding='utf-8') as fh:
+    with open_file(insertion_csv, 'w') as fh:
         print(",".join(header), file=fh)
         for strain in strain_data:
             print("{},{}".format(strain, ",".join(strain_data[strain])), file=fh)

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -29,6 +29,7 @@
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
+from .io.file import open_file
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
@@ -465,7 +466,7 @@ def run(args):
 
             # Save ancestral amino acid sequences to FASTA.
             if args.output_translations:
-                with open(args.output_translations.replace("%GENE", gene), "w", encoding="utf-8") as oh:
+                with open_file(args.output_translations.replace("%GENE", gene), "w") as oh:
                     for node in aa_result["tt"].tree.find_clades():
                         oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")
 

diff --git a/augur/clades.py b/augur/clades.py
@@ -19,6 +19,7 @@
 import networkx as nx
 from itertools import islice
 from .errors import AugurError
+from .io.file import PANDAS_READ_CSV_OPTIONS
 from argparse import SUPPRESS
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
@@ -64,6 +65,7 @@ def read_in_clade_definitions(clade_file):
         sep='\t' if clade_file.endswith('.tsv') else ',',
         comment='#',
         na_filter=False,
+        **PANDAS_READ_CSV_OPTIONS,
     )
 
     clade_inheritance_rows = df[df['gene'] == 'clade']

diff --git a/augur/distance.py b/augur/distance.py
@@ -186,6 +186,7 @@
 import sys
 
 from .frequency_estimators import timestamp_to_float
+from .io.file import open_file
 from .reconstruct_sequences import load_alignments
 from .utils import annotate_parents_for_tree, first_line, read_node_data, write_json
 
@@ -213,7 +214,7 @@ def read_distance_map(map_file):
     [('default', 0.0), ('map', {'SigPep': {0: {('W', 'P'): -8.3}}})]
     """
     # Load the JSON.
-    with open(map_file, "r", encoding='utf-8') as fh:
+    with open_file(map_file, "r") as fh:
         json_distance_map = json.load(fh)
 
     # Confirm that all required fields are present.

diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -12,6 +12,7 @@
 from Bio import Phylo
 
 from .errors import AugurError
+from .io.file import open_file
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
 from .types import ValidationMode
 from .utils import read_node_data, write_json, json_size, read_config, read_lat_longs, read_colors
@@ -1011,7 +1012,7 @@ def set_description(data_json, cmd_line_description_file):
     `meta.description` in *data_json* to the text provided.
     """
     try:
-        with open(cmd_line_description_file, encoding='utf-8') as description_file:
+        with open_file(cmd_line_description_file) as description_file:
             markdown_text = description_file.read()
             data_json['meta']['description'] = markdown_text
     except FileNotFoundError:

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -14,7 +14,7 @@
     ID_COLUMN as SEQUENCE_INDEX_ID_COLUMN,
     DELIMITER as SEQUENCE_INDEX_DELIMITER,
 )
-from augur.io.file import open_file
+from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file
 from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata
 from augur.io.sequences import read_sequences, write_sequences
 from augur.io.print import print_err
@@ -70,6 +70,7 @@ def run(args):
             sep=SEQUENCE_INDEX_DELIMITER,
             index_col=SEQUENCE_INDEX_ID_COLUMN,
             dtype={SEQUENCE_INDEX_ID_COLUMN: "string"},
+            **PANDAS_READ_CSV_OPTIONS,
         )
 
         # Remove temporary index file, if it exists.
@@ -134,13 +135,14 @@ def run(args):
             priorities = defaultdict(random_generator.random)
 
     # Setup logging.
+    output_log_context_manager = open_file(args.output_log, "w", newline='')
     output_log_writer = None
     if args.output_log:
         # Log the names of strains that were filtered or force-included, so we
         # can properly account for each strain (e.g., including those that were
         # initially filtered for one reason and then included again for another
         # reason).
-        output_log = open(args.output_log, "w", newline='')
+        output_log = output_log_context_manager.__enter__()
         output_log_header = ("strain", "filter", "kwargs")
         output_log_writer = csv.DictWriter(
             output_log,

diff --git a/augur/filter/io.py b/augur/filter/io.py
@@ -9,6 +9,7 @@
 from xopen import xopen
 
 from augur.errors import AugurError
+from augur.io.file import open_file
 from augur.io.metadata import Metadata, METADATA_DATE_COLUMN
 from augur.io.print import print_err
 from .constants import GROUP_BY_GENERATED_COLUMNS
@@ -76,7 +77,7 @@ def constant_factory(value):
         return lambda: value
 
     try:
-        with open(fname, encoding='utf-8') as pfile:
+        with open_file(fname) as pfile:
             return defaultdict(constant_factory(-np.inf), {
                 elems[0]: float(elems[1])
                 for elems in (line.strip().split('\t') if '\t' in line else line.strip().split() for line in pfile.readlines())

diff --git a/augur/frequencies.py b/augur/frequencies.py
@@ -10,6 +10,7 @@
 from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
 from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
 from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates
+from .io.file import open_file
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata
 from .utils import write_json
 
@@ -110,7 +111,7 @@ def run(args):
     if args.method == "kde":
         # Load weights if they have been provided.
         if args.weights:
-            with open(args.weights, "r", encoding='utf-8') as fh:
+            with open_file(args.weights, "r") as fh:
                 weights = json.load(fh)
 
             weights_attribute = args.weights_attribute

diff --git a/augur/import_/beast.py b/augur/import_/beast.py
@@ -11,6 +11,7 @@
 import numpy as np
 from Bio import Phylo
 from treetime import TreeAnc
+from augur.io.file import open_file
 from augur.utils import write_json
 
 def register_parser(parent_subparsers):
@@ -234,7 +235,7 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose
 
     if isinstance(tree_path,str): ## determine if path or handle was provided to function
         try:
-            handle=open(tree_path,'r', encoding='utf-8')
+            handle=open_file(tree_path,'r')
         except FileNotFoundError:
             print("FATAL: No such file {}".format(tree_path))
             sys.exit(2)

diff --git a/augur/io/file.py b/augur/io/file.py
@@ -1,7 +1,16 @@
 import os
 from contextlib import contextmanager
 from io import IOBase
+from textwrap import dedent
 from xopen import PipedCompressionReader, PipedCompressionWriter, xopen
+from augur.errors import AugurError
+
+
+ENCODING = "utf-8"
+
+PANDAS_READ_CSV_OPTIONS = {
+    'encoding': ENCODING,
+}
 
 
 @contextmanager
@@ -24,9 +33,23 @@ def open_file(path_or_buffer, mode="r", **kwargs):
         File handle object
 
     """
+
+    # Read and write all files using a specific encoding.
+    kwargs['encoding'] = ENCODING
+
     if isinstance(path_or_buffer, (str, os.PathLike)):
-        with xopen(path_or_buffer, mode, **kwargs) as handle:
-            yield handle
+        try:
+            with xopen(path_or_buffer, mode, **kwargs) as handle:
+                yield handle
+        except UnicodeDecodeError as e:
+            # TODO: Consider moving this to the top-level error handler to
+            # handle errors from other I/O functions such as pandas.read_csv.
+            # This is not trivial since the filepath is useful to include in the
+            # message, but is not available through UnicodeDecodeError alone.
+            raise AugurError(dedent(f"""\
+                File {path_or_buffer!r} contains {e.object[e.start:e.end]!r} which is not valid in the expected {e.encoding!r} encoding.
+                Try re-saving the file using the {e.encoding!r} encoding."""))
+
 
     elif isinstance(path_or_buffer, (IOBase, PipedCompressionReader, PipedCompressionWriter)):
         yield path_or_buffer

diff --git a/augur/io/metadata.py b/augur/io/metadata.py
@@ -10,7 +10,7 @@
 from augur.errors import AugurError
 from augur.io.print import print_err
 from augur.types import DataErrorMethod
-from .file import open_file
+from .file import PANDAS_READ_CSV_OPTIONS, open_file
 
 
 DEFAULT_DELIMITERS = (',', '\t')
@@ -95,6 +95,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
         metadata_file,
         iterator=True,
         **kwargs,
+        **PANDAS_READ_CSV_OPTIONS,
     )
     chunk = metadata.read(nrows=1)
     metadata.close()
@@ -153,7 +154,8 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
 
     return pd.read_csv(
         metadata_file,
-        **kwargs
+        **kwargs,
+        **PANDAS_READ_CSV_OPTIONS,
     )
 
 

diff --git a/augur/io/vcf.py b/augur/io/vcf.py
@@ -1,6 +1,7 @@
 import os
 import shlex
 
+from .file import open_file
 from .shell_command_runner import run_shell_command
 
 
@@ -67,7 +68,7 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
 
     #prepare the header of the VCF & write out
     header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
-    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
+    with open_file(vcf_file_name, 'w') as the_file:
         the_file.write( "##fileformat=VCFv4.2\n"+
                         "##source=NextStrain_Protein_Translation\n"+
                         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
@@ -122,10 +123,10 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
             vcfWrite.append("\t".join(output))
 
     #write it all out
-    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
+    with open_file(ref_file_name, 'w') as the_file:
         the_file.write("\n".join(refWrite))
 
-    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
+    with open_file(vcf_file_name, 'a') as the_file:
         the_file.write("\n".join(vcfWrite))
 
     if vcf_file_name.lower().endswith('.gz'):

diff --git a/augur/lbi.py b/augur/lbi.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 import json
 import numpy as np
+from .io.file import open_file
 from .utils import write_json
 
 
@@ -96,7 +97,7 @@ def run(args):
     tree = Bio.Phylo.read(args.tree, "newick")
 
     # Load branch lengths.
-    with open(args.branch_lengths, "r", encoding='utf-8') as json_fh:
+    with open_file(args.branch_lengths, "r") as json_fh:
         branch_lengths = json.load(json_fh)
 
     # Annotate branch lengths and dates onto tree nodes.

diff --git a/augur/measurements/export.py b/augur/measurements/export.py
@@ -6,6 +6,7 @@
 import sys
 
 from augur.argparse_ import HideAsFalseAction
+from augur.io.file import PANDAS_READ_CSV_OPTIONS
 from augur.utils import first_line, write_json
 from augur.validate import (
     measurements as read_measurements_json,
@@ -106,7 +107,7 @@ def run(args):
 
     # Load input collection TSV file
     try:
-        collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include)
+        collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include, **PANDAS_READ_CSV_OPTIONS)
     except FileNotFoundError:
         print(
             f"ERROR: collection TSV file {args.collection!r} does not exist",

diff --git a/augur/reconstruct_sequences.py b/augur/reconstruct_sequences.py
@@ -3,6 +3,7 @@
 """
 
 from Bio import SeqIO, Seq, SeqRecord, Phylo
+from .io.file import open_file
 from .utils import read_node_data
 
 
@@ -71,7 +72,7 @@ def run(args):
     #if VCF, read in the reference seq for each gene, put on root
     if(is_vcf):
         node_data["nodes"][root_node]['aa_sequences'] = {}
-        with open(args.vcf_aa_reference, encoding='utf-8') as handle:
+        with open_file(args.vcf_aa_reference) as handle:
             for record in SeqIO.parse(handle, "fasta"):
                 if record.id==args.gene:
                     #'root' may not be same as 'reference', so apply any mutations at root here!

diff --git a/augur/sequence_traits.py b/augur/sequence_traits.py
@@ -3,10 +3,10 @@
 """
 
 import sys
-import gzip
 import numpy as np
 from treetime.vcf_utils import read_vcf
 from collections import defaultdict
+from .io.file import PANDAS_READ_CSV_OPTIONS, open_file
 from .utils import write_json, get_json_name
 
 def read_in_translate_vcf(vcf_file, ref_file):
@@ -47,10 +47,7 @@ def mutation_struct():
     altLoc = 0
     sampLoc = 9
 
-    #Use different openers depending on whether compressed
-    opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open
-
-    with opn(vcf_file, mode='rt') as f:
+    with open_file(vcf_file, mode='rt') as f:
         samps = []
 
         for line in f:
@@ -169,7 +166,7 @@ def read_in_features(drm_file):
 
     mutPositions = defaultdict(list)
 
-    df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',')
+    df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',', **PANDAS_READ_CSV_OPTIONS)
     for mi, m in df.iterrows():
         pos = m.SITE-1 #put in python numbering
         gene = m.GENE if hasattr(m, 'GENE') else 'nuc'

diff --git a/augur/traits.py b/augur/traits.py
@@ -6,6 +6,7 @@
 from collections import defaultdict
 import sys
 from .errors import AugurError
+from .io.file import open_file
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
 from .utils import write_json, get_json_name
 TINY = 1e-12
@@ -157,7 +158,7 @@ def run(args):
     if args.weights:
         weight_dict = {c:{} for c in args.columns}
         sep = ',' if args.weights.endswith('csv') else '\t'
-        with open(args.weights, 'r', encoding='utf-8') as fh:
+        with open_file(args.weights, 'r') as fh:
             for line in fh:
                 if line[0]=='#':
                     continue
@@ -205,7 +206,7 @@ def run(args):
             models[column]['transition_matrix'] = [list(x) for x in gtr.W]
 
         if gtr:
-            with open(out_prefix+'%s.mugration_model.txt'%column, 'w', encoding='utf-8') as ofile:
+            with open_file(out_prefix+'%s.mugration_model.txt'%column, 'w') as ofile:
                 ofile.write('Map from character to field name\n')
                 for k,v in alphabet.items():
                     ofile.write(k+':\t'+str(v)+'\n')