Skip to content

Commit

Permalink
Merge pull request #1381: Enforce UTF-8 encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Feb 12, 2024
2 parents dd8a1cb + 5762baa commit e2ca468
Show file tree
Hide file tree
Showing 24 changed files with 122 additions and 44 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* `augur parse`: A new optional `--output-id-field` argument allows the user to select any ID field for the produced FASTA file (e.g. 'accession' instead of 'name' or 'strain'). [#1403][] (@j23414)
* When no `--output-id-field` is given and the data has both `name` and `strain` fields, `name` is still preferred over `strain` as the sequence ID field. However, a deprecation warning is now emitted because the order will be switched to prefer `strain` over `name` in the future, to be consistent with the rest of Augur.
* Added entry to [DEPRECATED.md](./DEPRECATED.md).
* Compression should now be supported for all input and output files. Please [open an issue](https://github.com/nextstrain/augur/issues) if you find a file for which it isn't! [#1381][] (@victorlin)

### Bug Fixes

Expand All @@ -19,8 +20,10 @@
* filter: Updated the help text of `--include` and `--include-where` to explicitly state that this can add strains that are missing an entry from `--sequences`. [#1389][] (@victorlin)
* filter: Fixed the summary messages to properly reflect force-inclusion of strains that are missing an entry from `--sequences`. [#1389][] (@victorlin)
* filter: Updated wording of summary messages. [#1389][] (@victorlin)
* Enforce UTF-8 encoding when reading and writing files. Improve error messages when a non-UTF-8 file is used. [#1381][] (@victorlin)

[#1294]: https://github.com/nextstrain/augur/pull/1294
[#1381]: https://github.com/nextstrain/augur/pull/1381
[#1389]: https://github.com/nextstrain/augur/pull/1389
[#1410]: https://github.com/nextstrain/augur/pull/1410
[#1403]: https://github.com/nextstrain/augur/pull/1403
Expand Down
3 changes: 2 additions & 1 deletion augur/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from shutil import copyfile
import numpy as np
from Bio import AlignIO, SeqIO, Seq, Align
from .io.file import open_file
from .io.shell_command_runner import run_shell_command
from .io.vcf import shquote
from .utils import nthreads_value
Expand Down Expand Up @@ -369,7 +370,7 @@ def analyse_insertions(aln, ungapped, insertion_csv):
for insertion_seq, strains in i_data.items():
for strain in strains:
strain_data[strain][idx] = insertion_seq
with open(insertion_csv, 'w', encoding='utf-8') as fh:
with open_file(insertion_csv, 'w') as fh:
print(",".join(header), file=fh)
for strain in strain_data:
print("{},{}".format(strain, ",".join(strain_data[strain])), file=fh)
Expand Down
3 changes: 2 additions & 1 deletion augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
from .io.file import open_file
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict
Expand Down Expand Up @@ -465,7 +466,7 @@ def run(args):

# Save ancestral amino acid sequences to FASTA.
if args.output_translations:
with open(args.output_translations.replace("%GENE", gene), "w", encoding="utf-8") as oh:
with open_file(args.output_translations.replace("%GENE", gene), "w") as oh:
for node in aa_result["tt"].tree.find_clades():
oh.write(f">{node.name}\n{aa_result['tt'].sequence(node, as_string=True, reconstructed=True)}\n")

Expand Down
2 changes: 2 additions & 0 deletions augur/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import networkx as nx
from itertools import islice
from .errors import AugurError
from .io.file import PANDAS_READ_CSV_OPTIONS
from argparse import SUPPRESS
from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name

Expand Down Expand Up @@ -64,6 +65,7 @@ def read_in_clade_definitions(clade_file):
sep='\t' if clade_file.endswith('.tsv') else ',',
comment='#',
na_filter=False,
**PANDAS_READ_CSV_OPTIONS,
)

clade_inheritance_rows = df[df['gene'] == 'clade']
Expand Down
3 changes: 2 additions & 1 deletion augur/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@
import sys

from .frequency_estimators import timestamp_to_float
from .io.file import open_file
from .reconstruct_sequences import load_alignments
from .utils import annotate_parents_for_tree, first_line, read_node_data, write_json

Expand Down Expand Up @@ -213,7 +214,7 @@ def read_distance_map(map_file):
[('default', 0.0), ('map', {'SigPep': {0: {('W', 'P'): -8.3}}})]
"""
# Load the JSON.
with open(map_file, "r", encoding='utf-8') as fh:
with open_file(map_file, "r") as fh:
json_distance_map = json.load(fh)

# Confirm that all required fields are present.
Expand Down
3 changes: 2 additions & 1 deletion augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from Bio import Phylo

from .errors import AugurError
from .io.file import open_file
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .types import ValidationMode
from .utils import read_node_data, write_json, json_size, read_config, read_lat_longs, read_colors
Expand Down Expand Up @@ -1011,7 +1012,7 @@ def set_description(data_json, cmd_line_description_file):
`meta.description` in *data_json* to the text provided.
"""
try:
with open(cmd_line_description_file, encoding='utf-8') as description_file:
with open_file(cmd_line_description_file) as description_file:
markdown_text = description_file.read()
data_json['meta']['description'] = markdown_text
except FileNotFoundError:
Expand Down
6 changes: 4 additions & 2 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
ID_COLUMN as SEQUENCE_INDEX_ID_COLUMN,
DELIMITER as SEQUENCE_INDEX_DELIMITER,
)
from augur.io.file import open_file
from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file
from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata
from augur.io.sequences import read_sequences, write_sequences
from augur.io.print import print_err
Expand Down Expand Up @@ -70,6 +70,7 @@ def run(args):
sep=SEQUENCE_INDEX_DELIMITER,
index_col=SEQUENCE_INDEX_ID_COLUMN,
dtype={SEQUENCE_INDEX_ID_COLUMN: "string"},
**PANDAS_READ_CSV_OPTIONS,
)

# Remove temporary index file, if it exists.
Expand Down Expand Up @@ -134,13 +135,14 @@ def run(args):
priorities = defaultdict(random_generator.random)

# Setup logging.
output_log_context_manager = open_file(args.output_log, "w", newline='')
output_log_writer = None
if args.output_log:
# Log the names of strains that were filtered or force-included, so we
# can properly account for each strain (e.g., including those that were
# initially filtered for one reason and then included again for another
# reason).
output_log = open(args.output_log, "w", newline='')
output_log = output_log_context_manager.__enter__()
output_log_header = ("strain", "filter", "kwargs")
output_log_writer = csv.DictWriter(
output_log,
Expand Down
3 changes: 2 additions & 1 deletion augur/filter/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from xopen import xopen

from augur.errors import AugurError
from augur.io.file import open_file
from augur.io.metadata import Metadata, METADATA_DATE_COLUMN
from augur.io.print import print_err
from .constants import GROUP_BY_GENERATED_COLUMNS
Expand Down Expand Up @@ -76,7 +77,7 @@ def constant_factory(value):
return lambda: value

try:
with open(fname, encoding='utf-8') as pfile:
with open_file(fname) as pfile:
return defaultdict(constant_factory(-np.inf), {
elems[0]: float(elems[1])
for elems in (line.strip().split('\t') if '\t' in line else line.strip().split() for line in pfile.readlines())
Expand Down
3 changes: 2 additions & 1 deletion augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates
from .io.file import open_file
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata
from .utils import write_json

Expand Down Expand Up @@ -110,7 +111,7 @@ def run(args):
if args.method == "kde":
# Load weights if they have been provided.
if args.weights:
with open(args.weights, "r", encoding='utf-8') as fh:
with open_file(args.weights, "r") as fh:
weights = json.load(fh)

weights_attribute = args.weights_attribute
Expand Down
3 changes: 2 additions & 1 deletion augur/import_/beast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
from Bio import Phylo
from treetime import TreeAnc
from augur.io.file import open_file
from augur.utils import write_json

def register_parser(parent_subparsers):
Expand Down Expand Up @@ -234,7 +235,7 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose

if isinstance(tree_path,str): ## determine if path or handle was provided to function
try:
handle=open(tree_path,'r', encoding='utf-8')
handle=open_file(tree_path,'r')
except FileNotFoundError:
print("FATAL: No such file {}".format(tree_path))
sys.exit(2)
Expand Down
27 changes: 25 additions & 2 deletions augur/io/file.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
import os
from contextlib import contextmanager
from io import IOBase
from textwrap import dedent
from xopen import PipedCompressionReader, PipedCompressionWriter, xopen
from augur.errors import AugurError


ENCODING = "utf-8"

PANDAS_READ_CSV_OPTIONS = {
'encoding': ENCODING,
}


@contextmanager
Expand All @@ -24,9 +33,23 @@ def open_file(path_or_buffer, mode="r", **kwargs):
File handle object
"""

# Read all files using a specific encoding.
kwargs['encoding'] = ENCODING

if isinstance(path_or_buffer, (str, os.PathLike)):
with xopen(path_or_buffer, mode, **kwargs) as handle:
yield handle
try:
with xopen(path_or_buffer, mode, **kwargs) as handle:
yield handle
except UnicodeDecodeError as e:
# TODO: Consider moving this to the top-level error handler to
# handle errors from other I/O functions such as pandas.read_csv.
# This is not trivial since the filepath is useful to include in the
# message, but is not available through UnicodeDecodeError alone.
raise AugurError(dedent(f"""\
File {path_or_buffer!r} contains {e.object[e.start:e.end]!r} which is not valid in the expected {e.encoding!r} encoding.
Try re-saving the file using the {e.encoding!r} encoding."""))


elif isinstance(path_or_buffer, (IOBase, PipedCompressionReader, PipedCompressionWriter)):
yield path_or_buffer
Expand Down
6 changes: 4 additions & 2 deletions augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from augur.errors import AugurError
from augur.io.print import print_err
from augur.types import DataErrorMethod
from .file import open_file
from .file import PANDAS_READ_CSV_OPTIONS, open_file


DEFAULT_DELIMITERS = (',', '\t')
Expand Down Expand Up @@ -95,6 +95,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
metadata_file,
iterator=True,
**kwargs,
**PANDAS_READ_CSV_OPTIONS,
)
chunk = metadata.read(nrows=1)
metadata.close()
Expand Down Expand Up @@ -153,7 +154,8 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id

return pd.read_csv(
metadata_file,
**kwargs
**kwargs,
**PANDAS_READ_CSV_OPTIONS,
)


Expand Down
7 changes: 4 additions & 3 deletions augur/io/vcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import shlex

from .file import open_file
from .shell_command_runner import run_shell_command


Expand Down Expand Up @@ -67,7 +68,7 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):

#prepare the header of the VCF & write out
header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
with open_file(vcf_file_name, 'w') as the_file:
the_file.write( "##fileformat=VCFv4.2\n"+
"##source=NextStrain_Protein_Translation\n"+
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
Expand Down Expand Up @@ -122,10 +123,10 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
vcfWrite.append("\t".join(output))

#write it all out
with open(ref_file_name, 'w', encoding='utf-8') as the_file:
with open_file(ref_file_name, 'w') as the_file:
the_file.write("\n".join(refWrite))

with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
with open_file(vcf_file_name, 'a') as the_file:
the_file.write("\n".join(vcfWrite))

if vcf_file_name.lower().endswith('.gz'):
Expand Down
3 changes: 2 additions & 1 deletion augur/lbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import defaultdict
import json
import numpy as np
from .io.file import open_file
from .utils import write_json


Expand Down Expand Up @@ -96,7 +97,7 @@ def run(args):
tree = Bio.Phylo.read(args.tree, "newick")

# Load branch lengths.
with open(args.branch_lengths, "r", encoding='utf-8') as json_fh:
with open_file(args.branch_lengths, "r") as json_fh:
branch_lengths = json.load(json_fh)

# Annotate branch lengths and dates onto tree nodes.
Expand Down
3 changes: 2 additions & 1 deletion augur/measurements/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sys

from augur.argparse_ import HideAsFalseAction
from augur.io.file import PANDAS_READ_CSV_OPTIONS
from augur.utils import first_line, write_json
from augur.validate import (
measurements as read_measurements_json,
Expand Down Expand Up @@ -106,7 +107,7 @@ def run(args):

# Load input collection TSV file
try:
collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include)
collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include, **PANDAS_READ_CSV_OPTIONS)
except FileNotFoundError:
print(
f"ERROR: collection TSV file {args.collection!r} does not exist",
Expand Down
3 changes: 2 additions & 1 deletion augur/reconstruct_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from Bio import SeqIO, Seq, SeqRecord, Phylo
from .io.file import open_file
from .utils import read_node_data


Expand Down Expand Up @@ -71,7 +72,7 @@ def run(args):
#if VCF, read in the reference seq for each gene, put on root
if(is_vcf):
node_data["nodes"][root_node]['aa_sequences'] = {}
with open(args.vcf_aa_reference, encoding='utf-8') as handle:
with open_file(args.vcf_aa_reference) as handle:
for record in SeqIO.parse(handle, "fasta"):
if record.id==args.gene:
#'root' may not be same as 'reference', so apply any mutations at root here!
Expand Down
9 changes: 3 additions & 6 deletions augur/sequence_traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
"""

import sys
import gzip
import numpy as np
from treetime.vcf_utils import read_vcf
from collections import defaultdict
from .io.file import PANDAS_READ_CSV_OPTIONS, open_file
from .utils import write_json, get_json_name

def read_in_translate_vcf(vcf_file, ref_file):
Expand Down Expand Up @@ -47,10 +47,7 @@ def mutation_struct():
altLoc = 0
sampLoc = 9

#Use different openers depending on whether compressed
opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open

with opn(vcf_file, mode='rt') as f:
with open_file(vcf_file, mode='rt') as f:
samps = []

for line in f:
Expand Down Expand Up @@ -169,7 +166,7 @@ def read_in_features(drm_file):

mutPositions = defaultdict(list)

df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',')
df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',', **PANDAS_READ_CSV_OPTIONS)
for mi, m in df.iterrows():
pos = m.SITE-1 #put in python numbering
gene = m.GENE if hasattr(m, 'GENE') else 'nuc'
Expand Down
5 changes: 3 additions & 2 deletions augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from collections import defaultdict
import sys
from .errors import AugurError
from .io.file import open_file
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .utils import write_json, get_json_name
TINY = 1e-12
Expand Down Expand Up @@ -157,7 +158,7 @@ def run(args):
if args.weights:
weight_dict = {c:{} for c in args.columns}
sep = ',' if args.weights.endswith('csv') else '\t'
with open(args.weights, 'r', encoding='utf-8') as fh:
with open_file(args.weights, 'r') as fh:
for line in fh:
if line[0]=='#':
continue
Expand Down Expand Up @@ -205,7 +206,7 @@ def run(args):
models[column]['transition_matrix'] = [list(x) for x in gtr.W]

if gtr:
with open(out_prefix+'%s.mugration_model.txt'%column, 'w', encoding='utf-8') as ofile:
with open_file(out_prefix+'%s.mugration_model.txt'%column, 'w') as ofile:
ofile.write('Map from character to field name\n')
for k,v in alphabet.items():
ofile.write(k+':\t'+str(v)+'\n')
Expand Down
Loading

0 comments on commit e2ca468

Please sign in to comment.