diff --git a/DEPRECATED.md b/DEPRECATED.md index 18fe3938c..772b428c0 100644 --- a/DEPRECATED.md +++ b/DEPRECATED.md @@ -4,6 +4,14 @@ These features are deprecated, which means they are no longer maintained and will go away in a future major version of Augur. They are currently still available for backwards compatibility, but should not be used in new code. +## `augur parse` preference of `name` over `strain` as the sequence ID field + +*Deprecated in February 2024. Planned to be reordered June 2024 or after.* + +Currently, `augur parse` checks for a 'name' field and then a 'strain' field to use as a sequence ID. This order will be changed in favor of searching for a 'strain' and then a 'name' field to be more consistent with the rest of Augur. + +Users who have both 'name' and 'strain' fields in their data, and want to favor using the 'name' field should add the following `augur parse` parameter `--output-id-field 'name'`. + ## `augur.utils.read_strains` *Deprecated in December 2023. Planned for removal March 2024 or after.* diff --git a/augur/parse.py b/augur/parse.py index 936daf37f..95b205c3a 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -6,10 +6,11 @@ from .io.file import open_file from .io.sequences import read_sequences, write_sequences -from .io.metadata import DEFAULT_ID_COLUMNS from .dates import get_numerical_date_from_value from .errors import AugurError +PARSE_DEFAULT_ID_COLUMNS = ("name", "strain") + forbidden_characters = str.maketrans( {' ': None, '(': '_', @@ -143,7 +144,7 @@ def register_parser(parent_subparsers): parser.add_argument('--output-sequences', required=True, help="output sequences file") parser.add_argument('--output-metadata', required=True, help="output metadata file") parser.add_argument('--output-id-field', required=False, - help=f"The record field to use as the sequence identifier in the FASTA output. If not provided, this will use the first available of {DEFAULT_ID_COLUMNS}. 
If none of those are available, this will use the first field in the fasta header.") + help=f"The record field to use as the sequence identifier in the FASTA output. If not provided, this will use the first available of {PARSE_DEFAULT_ID_COLUMNS}. If none of those are available, this will use the first field in the fasta header.") parser.add_argument('--fields', required=True, nargs='+', help="fields in fasta header") parser.add_argument('--prettify-fields', nargs='+', help="apply string prettifying operations (underscores to spaces, capitalization, etc) to specified metadata fields") parser.add_argument('--separator', default='|', help="separator of fasta header") @@ -169,9 +170,11 @@ def run(args): raise AugurError(f"Output id field '{args.output_id_field}' not found in fields {args.fields}.") strain_key = args.output_id_field else: - for possible_id in DEFAULT_ID_COLUMNS: + for possible_id in PARSE_DEFAULT_ID_COLUMNS: if possible_id in args.fields: strain_key = possible_id + if possible_id == "name" and "strain" in args.fields: + print("DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').\nUsers who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'", file=sys.stderr) break if not strain_key: strain_key = args.fields[0] diff --git a/tests/functional/parse.t b/tests/functional/parse.t index 43f9b00ed..6b62bbb5a 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -62,7 +62,7 @@ This should fail. ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url']. [2] -Parse Zika sequences into sequences and metadata, preferred default ids is 'strain', then 'name', then first field. +Parse Zika sequences into sequences and metadata, preferred default ids is 'name', then 'strain', then first field. 
$ ${AUGUR} parse \ > --sequences parse/zika.fasta \ @@ -71,18 +71,20 @@ Parse Zika sequences into sequences and metadata, preferred default ids is 'stra > --fields strain virus name date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst + DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name'). + Users who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name' - $ diff -u "parse/sequences.fasta" "$TMP/sequences.fasta" + $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta" $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv" -Parse Zika sequences into sequences and metadata when there is no 'strain' field. -This should use the 2nd entry in DEFAULT_ID_COLUMNS ('strain', 'name') instead. +Parse Zika sequences into sequences and metadata when there is no 'name' field. +This should use the 2nd entry in PARSE_DEFAULT_ID_COLUMNS ('name', 'strain') instead. $ ${AUGUR} parse \ > --sequences parse/zika.fasta \ > --output-sequences "$TMP/sequences.fasta" \ > --output-metadata "$TMP/metadata.tsv" \ - > --fields col1 virus name date region country division city db segment authors url title journal paper_url \ + > --fields col1 virus strain date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst