diff --git a/CHANGES.md b/CHANGES.md index 591febe98..2ba0a960f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,6 +9,7 @@ * `augur parse`: A new optional `--output-id-field` argument allows the user to select any ID field for the produced FASTA file (e.g. 'accession' instead of 'name' or 'strain'). [#1403][] (@j23414) * When no `--output-id-field` is given and the data has both `name` and `strain` fields, continue to preferentially use `name` over `strain` as the sequence ID field; but, throw a deprecation warning that the order will be switched to prefer `strain` over `name` in the future to be consistent with the rest of Augur. * Added entry to [DEPRECATED.md](./DEPRECATED.md). +* Compression should now be supported for all input and output files. Please [open an issue](https://github.com/nextstrain/augur/issues) if you find one that doesn't! [#1381][] (@victorlin) ### Bug Fixes @@ -19,8 +20,10 @@ * filter: Updated the help text of `--include` and `--include-where` to explicitly state that this can add strains that are missing an entry from `--sequences`. [#1389][] (@victorlin) * filter: Fixed the summary messages to properly reflect force-inclusion of strains that are missing an entry from `--sequences`. [#1389][] (@victorlin) * filter: Updated wording of summary messages. [#1389][] (@victorlin) +* Enforce UTF-8 encoding when reading and writing files. Improve error messages when a non-UTF-8 file is used. [#1381][] (@victorlin) [#1294]: https://github.com/nextstrain/augur/pull/1294 +[#1381]: https://github.com/nextstrain/augur/pull/1381 [#1389]: https://github.com/nextstrain/augur/pull/1389 [#1410]: https://github.com/nextstrain/augur/pull/1410 [#1403]: https://github.com/nextstrain/augur/pull/1403 diff --git a/augur/align.py b/augur/align.py index c9b021d34..7c2a7af2d 100644 --- a/augur/align.py +++ b/augur/align.py @@ -6,6 +6,7 @@ from shutil import copyfile import numpy as np from Bio import AlignIO, SeqIO, Seq, Align +from .io.file import open_file from .io.shell_command_runner import run_shell_command from .io.vcf import shquote from .utils import nthreads_value @@ -369,7 +370,7 @@ def analyse_insertions(aln, ungapped, insertion_csv): for insertion_seq, strains in i_data.items(): for strain in strains: strain_data[strain][idx] = insertion_seq - with open(insertion_csv, 'w', encoding='utf-8') as fh: + with open_file(insertion_csv, 'w') as fh: print(",".join(header), file=fh) for strain in strain_data: print("{},{}".format(strain, ",".join(strain_data[strain])), file=fh) diff --git a/augur/ancestral.py b/augur/ancestral.py index 67edd9a00..2771948eb 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -29,6 +29,7 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name +from .io.file import open_file from .io.vcf import is_vcf as is_filename_vcf from treetime.vcf_utils import read_vcf, write_vcf from collections import defaultdict @@ -465,7 +466,7 @@ def run(args): # Save ancestral amino acid sequences to FASTA. 
if args.output_translations: - with open(args.output_translations.replace("%GENE", gene), "w", encoding="utf-8") as oh: + with open_file(args.output_translations.replace("%GENE", gene), "w") as oh: for node in aa_result["tt"].tree.find_clades(): oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n") diff --git a/augur/clades.py b/augur/clades.py index 39053aa31..bb39ec511 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -19,6 +19,7 @@ import networkx as nx from itertools import islice from .errors import AugurError +from .io.file import PANDAS_READ_CSV_OPTIONS from argparse import SUPPRESS from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name @@ -64,6 +65,7 @@ def read_in_clade_definitions(clade_file): sep='\t' if clade_file.endswith('.tsv') else ',', comment='#', na_filter=False, + **PANDAS_READ_CSV_OPTIONS, ) clade_inheritance_rows = df[df['gene'] == 'clade'] diff --git a/augur/distance.py b/augur/distance.py index e44584e1e..bfcaa1bf2 100644 --- a/augur/distance.py +++ b/augur/distance.py @@ -186,6 +186,7 @@ import sys from .frequency_estimators import timestamp_to_float +from .io.file import open_file from .reconstruct_sequences import load_alignments from .utils import annotate_parents_for_tree, first_line, read_node_data, write_json @@ -213,7 +214,7 @@ def read_distance_map(map_file): [('default', 0.0), ('map', {'SigPep': {0: {('W', 'P'): -8.3}}})] """ # Load the JSON. - with open(map_file, "r", encoding='utf-8') as fh: + with open_file(map_file, "r") as fh: json_distance_map = json.load(fh) # Confirm that all required fields are present. diff --git a/augur/export_v2.py b/augur/export_v2.py index 6b8ae0e70..02aab1a37 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -12,6 +12,7 @@ from Bio import Phylo from .errors import AugurError +from .io.file import open_file from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .types import ValidationMode from .utils import read_node_data, write_json, json_size, read_config, read_lat_longs, read_colors @@ -1011,7 +1012,7 @@ def set_description(data_json, cmd_line_description_file): `meta.description` in *data_json* to the text provided. """ try: - with open(cmd_line_description_file, encoding='utf-8') as description_file: + with open_file(cmd_line_description_file) as description_file: markdown_text = description_file.read() data_json['meta']['description'] = markdown_text except FileNotFoundError: diff --git a/augur/filter/_run.py b/augur/filter/_run.py index b81758dd3..0e9f43bf4 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -14,7 +14,7 @@ ID_COLUMN as SEQUENCE_INDEX_ID_COLUMN, DELIMITER as SEQUENCE_INDEX_DELIMITER, ) -from augur.io.file import open_file +from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata from augur.io.sequences import read_sequences, write_sequences from augur.io.print import print_err @@ -70,6 +70,7 @@ def run(args): sep=SEQUENCE_INDEX_DELIMITER, index_col=SEQUENCE_INDEX_ID_COLUMN, dtype={SEQUENCE_INDEX_ID_COLUMN: "string"}, + **PANDAS_READ_CSV_OPTIONS, ) # Remove temporary index file, if it exists. @@ -134,13 +135,14 @@ def run(args): priorities = defaultdict(random_generator.random) # Setup logging. 
+ output_log_context_manager = open_file(args.output_log, "w", newline='') output_log_writer = None if args.output_log: # Log the names of strains that were filtered or force-included, so we # can properly account for each strain (e.g., including those that were # initially filtered for one reason and then included again for another # reason). - output_log = open(args.output_log, "w", newline='') + output_log = output_log_context_manager.__enter__() output_log_header = ("strain", "filter", "kwargs") output_log_writer = csv.DictWriter( output_log, diff --git a/augur/filter/io.py b/augur/filter/io.py index 6ebf253ef..670b5b245 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -9,6 +9,7 @@ from xopen import xopen from augur.errors import AugurError +from augur.io.file import open_file from augur.io.metadata import Metadata, METADATA_DATE_COLUMN from augur.io.print import print_err from .constants import GROUP_BY_GENERATED_COLUMNS @@ -76,7 +77,7 @@ def constant_factory(value): return lambda: value try: - with open(fname, encoding='utf-8') as pfile: + with open_file(fname) as pfile: return defaultdict(constant_factory(-np.inf), { elems[0]: float(elems[1]) for elems in (line.strip().split('\t') if '\t' in line else line.strip().split() for line in pfile.readlines()) diff --git a/augur/frequencies.py b/augur/frequencies.py index 3afb47860..1a4461b90 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -10,6 +10,7 @@ from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates +from .io.file import open_file from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata from .utils import write_json @@ -110,7 +111,7 @@ def run(args): if args.method == "kde": # Load weights if they have been provided. if args.weights: - with open(args.weights, "r", encoding='utf-8') as fh: + with open_file(args.weights, "r") as fh: weights = json.load(fh) weights_attribute = args.weights_attribute diff --git a/augur/import_/beast.py b/augur/import_/beast.py index acef81269..a30734bc8 100644 --- a/augur/import_/beast.py +++ b/augur/import_/beast.py @@ -11,6 +11,7 @@ import numpy as np from Bio import Phylo from treetime import TreeAnc +from augur.io.file import open_file from augur.utils import write_json def register_parser(parent_subparsers): @@ -234,7 +235,7 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose if isinstance(tree_path,str): ## determine if path or handle was provided to function try: - handle=open(tree_path,'r', encoding='utf-8') + handle=open_file(tree_path,'r') except FileNotFoundError: print("FATAL: No such file {}".format(tree_path)) sys.exit(2) diff --git a/augur/io/file.py b/augur/io/file.py index 1104f89cf..5b8a7bc13 100644 --- a/augur/io/file.py +++ b/augur/io/file.py @@ -1,7 +1,16 @@ import os from contextlib import contextmanager from io import IOBase +from textwrap import dedent from xopen import PipedCompressionReader, PipedCompressionWriter, xopen +from augur.errors import AugurError + + +ENCODING = "utf-8" + +PANDAS_READ_CSV_OPTIONS = { + 'encoding': ENCODING, +} @contextmanager @@ -24,9 +33,23 @@ def open_file(path_or_buffer, mode="r", **kwargs): File handle object """ + + # Read all files using a specific encoding. 
+    kwargs['encoding'] = ENCODING
+
     if isinstance(path_or_buffer, (str, os.PathLike)):
-        with xopen(path_or_buffer, mode, **kwargs) as handle:
-            yield handle
+        try:
+            with xopen(path_or_buffer, mode, **kwargs) as handle:
+                yield handle
+        except UnicodeDecodeError as e:
+            # TODO: Consider moving this to the top-level error handler to
+            # handle errors from other I/O functions such as pandas.read_csv.
+            # This is not trivial since the filepath is useful to include in the
+            # message, but is not available through UnicodeDecodeError alone.
+            raise AugurError(dedent(f"""\
+                File {path_or_buffer!r} contains {e.object[e.start:e.end]!r} which is not valid in the expected {e.encoding!r} encoding.
+                Try re-saving the file using the {e.encoding!r} encoding."""))
+
     elif isinstance(path_or_buffer, (IOBase, PipedCompressionReader, PipedCompressionWriter)):
         yield path_or_buffer
diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index f8be2f5ad..32747eceb 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -10,7 +10,7 @@ from augur.errors import AugurError
 from augur.io.print import print_err
 from augur.types import DataErrorMethod
-from .file import open_file
+from .file import PANDAS_READ_CSV_OPTIONS, open_file

 DEFAULT_DELIMITERS = (',', '\t')

@@ -95,6 +95,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
             metadata_file,
             iterator=True,
             **kwargs,
+            **PANDAS_READ_CSV_OPTIONS,
         )
         chunk = metadata.read(nrows=1)
         metadata.close()
@@ -153,7 +154,8 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id

     return pd.read_csv(
         metadata_file,
-        **kwargs
+        **kwargs,
+        **PANDAS_READ_CSV_OPTIONS,
     )

diff --git a/augur/io/vcf.py b/augur/io/vcf.py
index cb472b065..c808c3e40 100644
--- a/augur/io/vcf.py
+++ b/augur/io/vcf.py
@@ -1,6 +1,7 @@
 import os
 import shlex

+from .file import open_file
 from .shell_command_runner import run_shell_command

@@ -67,7 +68,7 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
     #prepare the header of the VCF & write out
     header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
-    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
+    with open_file(vcf_file_name, 'w') as the_file:
         the_file.write( "##fileformat=VCFv4.2\n"+
                         "##source=NextStrain_Protein_Translation\n"+
                         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
@@ -122,10 +123,10 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
         vcfWrite.append("\t".join(output))

     #write it all out
-    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
+    with open_file(ref_file_name, 'w') as the_file:
         the_file.write("\n".join(refWrite))

-    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
+    with open_file(vcf_file_name, 'a') as the_file:
         the_file.write("\n".join(vcfWrite))

     if vcf_file_name.lower().endswith('.gz'):
diff --git a/augur/lbi.py b/augur/lbi.py
index bf7387797..9624f7f6e 100644
--- a/augur/lbi.py
+++ b/augur/lbi.py
@@ -5,6 +5,7 @@ from collections import defaultdict
 import json
 import numpy as np

+from .io.file import open_file
 from .utils import write_json

@@ -96,7 +97,7 @@ def run(args):
     tree = Bio.Phylo.read(args.tree, "newick")

     # Load branch lengths.
-    with open(args.branch_lengths, "r", encoding='utf-8') as json_fh:
+    with open_file(args.branch_lengths, "r") as json_fh:
         branch_lengths = json.load(json_fh)

     # Annotate branch lengths and dates onto tree nodes.
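A minimal sketch of how the new error handling in augur/io/file.py surfaces to callers (not part of the patch; the file name and byte value are taken from the cram test below, and any non-UTF-8 read through open_file behaves the same way):

    from augur.errors import AugurError
    from augur.io.file import open_file

    try:
        # Reading a non-UTF-8 file through open_file now raises a friendly
        # AugurError instead of a bare UnicodeDecodeError traceback. The
        # decode error raised inside the with-body propagates back into the
        # context manager at its yield point, where it is caught and wrapped.
        with open_file("metadata-windows-1252.tsv") as fh:
            fh.read()
    except AugurError as e:
        print(e)
        # File 'metadata-windows-1252.tsv' contains b'\xe3' which is not valid
        # in the expected 'utf-8' encoding.
        # Try re-saving the file using the 'utf-8' encoding.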
diff --git a/augur/measurements/export.py b/augur/measurements/export.py index 077bd63ba..ec495590a 100644 --- a/augur/measurements/export.py +++ b/augur/measurements/export.py @@ -6,6 +6,7 @@ import sys from augur.argparse_ import HideAsFalseAction +from augur.io.file import PANDAS_READ_CSV_OPTIONS from augur.utils import first_line, write_json from augur.validate import ( measurements as read_measurements_json, @@ -106,7 +107,7 @@ def run(args): # Load input collection TSV file try: - collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include) + collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include, **PANDAS_READ_CSV_OPTIONS) except FileNotFoundError: print( f"ERROR: collection TSV file {args.collection!r} does not exist", diff --git a/augur/reconstruct_sequences.py b/augur/reconstruct_sequences.py index 8a6ba258a..478801855 100644 --- a/augur/reconstruct_sequences.py +++ b/augur/reconstruct_sequences.py @@ -3,6 +3,7 @@ """ from Bio import SeqIO, Seq, SeqRecord, Phylo +from .io.file import open_file from .utils import read_node_data @@ -71,7 +72,7 @@ def run(args): #if VCF, read in the reference seq for each gene, put on root if(is_vcf): node_data["nodes"][root_node]['aa_sequences'] = {} - with open(args.vcf_aa_reference, encoding='utf-8') as handle: + with open_file(args.vcf_aa_reference) as handle: for record in SeqIO.parse(handle, "fasta"): if record.id==args.gene: #'root' may not be same as 'reference', so apply any mutations at root here! diff --git a/augur/sequence_traits.py b/augur/sequence_traits.py index 601d1dd61..1d09ac6b2 100644 --- a/augur/sequence_traits.py +++ b/augur/sequence_traits.py @@ -3,10 +3,10 @@ """ import sys -import gzip import numpy as np from treetime.vcf_utils import read_vcf from collections import defaultdict +from .io.file import PANDAS_READ_CSV_OPTIONS, open_file from .utils import write_json, get_json_name def read_in_translate_vcf(vcf_file, ref_file): @@ -47,10 +47,7 @@ def mutation_struct(): altLoc = 0 sampLoc = 9 - #Use different openers depending on whether compressed - opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open - - with opn(vcf_file, mode='rt') as f: + with open_file(vcf_file, mode='rt') as f: samps = [] for line in f: @@ -169,7 +166,7 @@ def read_in_features(drm_file): mutPositions = defaultdict(list) - df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',') + df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',', **PANDAS_READ_CSV_OPTIONS) for mi, m in df.iterrows(): pos = m.SITE-1 #put in python numbering gene = m.GENE if hasattr(m, 'GENE') else 'nuc' diff --git a/augur/traits.py b/augur/traits.py index 893bf052d..0b520c600 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -6,6 +6,7 @@ from collections import defaultdict import sys from .errors import AugurError +from .io.file import open_file from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .utils import write_json, get_json_name TINY = 1e-12 @@ -157,7 +158,7 @@ def run(args): if args.weights: weight_dict = {c:{} for c in args.columns} sep = ',' if args.weights.endswith('csv') else '\t' - with open(args.weights, 'r', encoding='utf-8') as fh: + with open_file(args.weights, 'r') as fh: for line in fh: if line[0]=='#': continue @@ -205,7 +206,7 @@ def run(args): models[column]['transition_matrix'] = [list(x) for x in gtr.W] if gtr: - with open(out_prefix+'%s.mugration_model.txt'%column, 'w', encoding='utf-8') as ofile: + with 
open_file(out_prefix+'%s.mugration_model.txt'%column, 'w') as ofile: ofile.write('Map from character to field name\n') for k,v in alphabet.items(): ofile.write(k+':\t'+str(v)+'\n') diff --git a/augur/tree.py b/augur/tree.py index 84e2c2143..98c31f510 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -16,6 +16,7 @@ from pathlib import Path from .errors import AugurError +from .io.file import open_file from .io.sequences import read_sequences from .io.shell_command_runner import run_shell_command from .io.vcf import shquote @@ -249,7 +250,7 @@ def random_string(n): tmp_aln_file = str(Path(aln_file).with_name(Path(aln_file).stem + "-delim.fasta")) log_file = str(Path(tmp_aln_file).with_suffix(".iqtree.log")) num_seqs = 0 - with open(tmp_aln_file, 'w', encoding='utf-8') as ofile, open(aln_file, encoding='utf-8') as ifile: + with open_file(tmp_aln_file, 'w') as ofile, open_file(aln_file) as ifile: for line in ifile: tmp_line = line if line.startswith(">"): @@ -358,7 +359,7 @@ def write_out_informative_fasta(compress_seq, alignment, stripFile=None): #If want a position map, print: if printPositionMap: - with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file: + with open_file(fasta_file+".positions.txt", 'w') as the_file: the_file.write("\n".join(pos)) return fasta_file @@ -396,7 +397,7 @@ def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_fil # Write the masked alignment to disk one record at a time. alignment_file_path = Path(alignment_file) masked_alignment_file = str(alignment_file_path.parent / ("masked_%s" % alignment_file_path.name)) - with open(masked_alignment_file, "w", encoding='utf-8') as oh: + with open_file(masked_alignment_file, "w") as oh: for record in alignment: # Convert to a mutable sequence to enable masking with Ns. 
sequence = MutableSeq(str(record.seq)) diff --git a/augur/util_support/color_parser.py b/augur/util_support/color_parser.py index 28ac3a1d8..1f8c9360f 100644 --- a/augur/util_support/color_parser.py +++ b/augur/util_support/color_parser.py @@ -2,6 +2,7 @@ import functools from augur.data import as_file +from augur.io.file import open_file from augur.util_support.color_parser_line import ColorParserLine @@ -17,11 +18,11 @@ def mapping(self): if self.use_defaults: with as_file("colors.tsv") as file: - with open(file, encoding="utf-8") as defaults: + with open_file(file) as defaults: colors = {**colors, **self.parse_file(defaults)} if self.mapping_filename: - with open(self.mapping_filename, encoding="utf-8") as mapping: + with open_file(self.mapping_filename) as mapping: colors = {**colors, **self.parse_file(mapping)} return colors diff --git a/augur/util_support/node_data_file.py b/augur/util_support/node_data_file.py index 4add65ef4..2d27c421b 100644 --- a/augur/util_support/node_data_file.py +++ b/augur/util_support/node_data_file.py @@ -3,6 +3,7 @@ from augur.__version__ import __version__ from augur.__version__ import is_augur_version_compatible from augur.errors import AugurError +from augur.io.file import open_file from augur.io.print import print_err from augur.types import ValidationMode from augur.validate import validate_json, ValidateError, load_json_schema @@ -16,7 +17,7 @@ def __init__(self, fname, validation_mode=ValidationMode.ERROR): self.fname = fname self.validation_mode = validation_mode - with open(fname, encoding="utf-8") as jfile: + with open_file(fname) as jfile: self.attrs = json.load(jfile) self.validate() diff --git a/augur/utils.py b/augur/utils.py index e54b565a8..bd1a9ccd7 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -10,7 +10,7 @@ from .__version__ import __version__ from augur.data import as_file -from augur.io.file import open_file +from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file from augur.io.print import print_err from augur.types import ValidationMode @@ -292,7 +292,7 @@ def _read_gff(reference, feature_names): valid_types = ['gene', 'source', 'region'] features = {} - with open(reference, encoding='utf-8') as in_handle: + with open_file(reference) as in_handle: # Note that `GFF.parse` doesn't always yield GFF records in the order # one may expect, but since we raise AugurError if there are multiple # this doesn't matter. @@ -443,7 +443,7 @@ def read_config(fname): return defaultdict(dict) try: - with open(fname, 'rb') as ifile: + with open_file(fname, 'rb') as ifile: config = json.load(ifile) except json.decoder.JSONDecodeError as err: print("FATAL ERROR:") @@ -474,12 +474,12 @@ def add_line_to_coordinates(line): print("WARNING: geo-coordinate file contains invalid line. Please make sure not to mix tabs and spaces as delimiters (use only tabs):",line) if use_defaults: with as_file("lat_longs.tsv") as file: - with open(file, encoding="utf-8") as defaults: + with open_file(file) as defaults: for line in defaults: add_line_to_coordinates(line) if overrides: if os.path.isfile(overrides): - with open(overrides, encoding='utf-8') as ifile: + with open_file(overrides) as ifile: for line in ifile: add_line_to_coordinates(line) else: @@ -699,11 +699,11 @@ def read_bed_file(bed_file): mask_sites = [] try: bed = pd.read_csv(bed_file, sep='\t', header=None, usecols=[1,2], - dtype={1:int,2:int}) + dtype={1:int,2:int}, **PANDAS_READ_CSV_OPTIONS) except ValueError: # Check if we have a header row. Otherwise, just fail. 
bed = pd.read_csv(bed_file, sep='\t', header=None, usecols=[1,2], - dtype={1:int,2:int}, skiprows=1) + dtype={1:int,2:int}, skiprows=1, **PANDAS_READ_CSV_OPTIONS) print("Skipped row 1 of %s, assuming it is a header." % bed_file) for _, row in bed.iterrows(): mask_sites.extend(range(row[1], row[2])) @@ -728,7 +728,7 @@ def read_mask_file(mask_file): Sorted list of unique zero-indexed sites """ mask_sites = [] - with open(mask_file, encoding='utf-8') as mf: + with open_file(mask_file) as mf: for idx, line in enumerate(l.strip() for l in mf.readlines()): if "\t" in line: line = line.split("\t")[1] diff --git a/augur/validate.py b/augur/validate.py index aefdf5589..364a5d785 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -12,6 +12,7 @@ from textwrap import indent from typing import Iterable, Union from augur.data import as_file +from augur.io.file import open_file from augur.io.print import print_err from augur.io.json import shorten_as_json from .validate_export import verifyMainJSONIsInternallyConsistent, verifyMetaAndOrTreeJSONsAreInternallyConsistent @@ -30,7 +31,7 @@ def load_json_schema(path, refs=None): (located in augur/data) ''' try: - with as_file(path) as file, open(file, "r", encoding = "utf-8") as fh: + with as_file(path) as file, open_file(file, "r") as fh: schema = json.load(fh) except json.JSONDecodeError as err: raise ValidateError("Schema {} is not a valid JSON file. Error: {}".format(path, err)) @@ -45,7 +46,7 @@ def load_json_schema(path, refs=None): # Make the validator aware of additional schemas schema_store = dict() for k, v in refs.items(): - with as_file(v) as file, open(file, "r", encoding = "utf-8") as fh: + with as_file(v) as file, open_file(file, "r") as fh: schema_store[k] = json.load(fh) resolver = jsonschema.RefResolver.from_schema(schema,store=schema_store) schema_validator = Validator(schema, resolver=resolver) @@ -67,7 +68,7 @@ def resolve_remote(url): def load_json(path): - with open(path, 'rb') as fh: + with open_file(path, 'rb') as fh: try: jsonToValidate = json.load(fh) except json.JSONDecodeError: diff --git a/tests/functional/filter/cram/filter-file-encoding-error.t b/tests/functional/filter/cram/filter-file-encoding-error.t new file mode 100644 index 000000000..926cfb083 --- /dev/null +++ b/tests/functional/filter/cram/filter-file-encoding-error.t @@ -0,0 +1,33 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create a metadata file that contains a non-ASCII character. + + $ cat >metadata.tsv <<~~ + > strain col1 + > SEQ_1 ã + > SEQ_2 b + > SEQ_3 c + > ~~ + +Encode it as WINDOWS-1252. + + $ iconv -f UTF-8 -t WINDOWS-1252 metadata.tsv > metadata-windows-1252.tsv + +The UTF-8 encoded file can be used without issues. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --output-strains filtered_strains.txt + 0 strains were dropped during filtering + 3 strains passed all filters + +An error is shown when using the WINDOWS-1252 encoded file. + + $ ${AUGUR} filter \ + > --metadata metadata-windows-1252.tsv \ + > --output-strains filtered_strains.txt + ERROR: File 'metadata-windows-1252.tsv' contains b'\xe3' which is not valid in the expected 'utf-8' encoding. + Try re-saving the file using the 'utf-8' encoding. + [2]
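For pandas-based readers, the patch threads the same encoding through PANDAS_READ_CSV_OPTIONS rather than open_file. A minimal sketch of the intended usage, mirroring the pd.read_csv calls changed above (assumes pandas is installed; the file name is the one from the test):

    import pandas as pd
    from augur.io.file import PANDAS_READ_CSV_OPTIONS

    # Pinning encoding='utf-8' keeps tabular readers consistent with
    # open_file: a mis-encoded file fails loudly with a decode error rather
    # than being read under a platform- or locale-dependent default.
    df = pd.read_csv("metadata.tsv", sep="\t", **PANDAS_READ_CSV_OPTIONS)
    print(df.head())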