-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a60f5a3
commit 6614a79
Showing
4 changed files
with
574 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,5 @@ | ||
# craftbox x.x.x | ||
|
||
## NEW FEATURES | ||
|
||
* `csv2fasta`: Convert two columns from a CSV file to FASTA entries (PR #). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
name: csv2fasta | ||
description: | | ||
Convert two columns from a CSV file to FASTA entries. The CSV file can | ||
contain an optional header and each row (other than the header) becomes | ||
a single FASTA record. One of the two columns will be used as the names | ||
for the FASTA entries, while the other becomes the sequences. The sequences | ||
column must only contain characters that are valid IUPAC notation for | ||
nucleotides or a group thereof (wildcard characters). | ||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- name: --input | ||
type: file | ||
direction: input | ||
example: barcodes.csv | ||
description: CSV file to be processed. | ||
required: true | ||
- name: --header | ||
type: boolean_true | ||
description: | | ||
Parse the first line of the CSV file as a header. | ||
- name: "CSV dialect options" | ||
description: | | ||
Options that can be used to override the automatically detected | ||
dialect of the CSV file. | ||
arguments: | ||
- name: --delimiter | ||
type: string | ||
description: | | ||
Override the column delimiter character. | ||
- name: --quote_character | ||
type: string | ||
description: | | ||
Override the character used to denote the start and end of a quoted item. | ||
- name: "CSV column arguments" | ||
description: | | ||
Parameters for the selection of columns from the CSV file. | ||
Only required when your CSV file contains more than 2 columns, | ||
otherwise the first column will be used for the FASTA header | ||
and the second for the FASTA nucleotide sequences. This default | ||
can still be overwritten by using the options below. | ||
arguments: | ||
- name: --sequence_column | ||
type: string | ||
description: | | ||
Name of the column containing the sequences. Implies 'header'. | ||
Cannot be used together with 'sequence_column_index'. | ||
required: false | ||
- name: "--name_column" | ||
type: string | ||
description: | | ||
Name of the column describing the FASTA headers. Implies 'header'. | ||
Cannot be used together with 'name_column_index'. | ||
required: false | ||
- name: "--sequence_column_index" | ||
type: integer | ||
min: 0 | ||
description: | | ||
Index of the column to use as the FASTA sequences, counted from the left and | ||
starting from 0. Cannot be used in combination with the 'sequence_column' argument. | ||
required: false | ||
- name: "--name_column_index" | ||
type: integer | ||
min: 0 | ||
description: | | ||
Index of the column to use as the FASTA headers, counted from the left and | ||
starting from 0. Cannot be used in combination with 'name_column'. | ||
required: false | ||
- name: Outputs | ||
arguments: | ||
- name: "--output" | ||
type: file | ||
example: barcodes.fasta | ||
direction: output | ||
description: Output fasta file. | ||
|
||
resources: | ||
- type: python_script | ||
path: script.py | ||
test_resources: | ||
- type: python_script | ||
path: test_csv2fasta.py | ||
|
||
engines: | ||
- type: docker | ||
image: python:slim | ||
setup: | ||
- type: apt | ||
packages: | ||
- procps | ||
- type: python | ||
packages: | ||
- dnaio | ||
test_setup: | ||
- type: python | ||
packages: | ||
- pytest | ||
- viashpy | ||
|
||
runners: | ||
- type: executable | ||
- type: nextflow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
from pathlib import Path | ||
import dnaio | ||
import csv | ||
|
||
## VIASH START
# Development placeholder: when the component is built, viash substitutes the
# section between the VIASH START and VIASH END markers with the actual
# argument values. The entries below list every key this script reads from
# `par`, so the script can also be run directly while developing. Optional
# arguments that were not provided are represented by None.
par = {
    "input": "barcodes.csv",
    "header": False,
    "delimiter": None,
    "quote_character": None,
    "sequence_column": None,
    "name_column": None,
    "sequence_column_index": None,
    "name_column_index": None,
    "output": "barcodes.fasta",
}
## VIASH END
|
||
# Letters accepted in the sequence column. Sequences are upper-cased before
# being compared against this set, so lower-case input is accepted as well.
iupac = frozenset("ABCDGHKMNRSTUVWXY")


def resolve_header_name_to_index(header_entries, column_name):
    """Return the position of ``column_name`` in the CSV header row.

    Args:
        header_entries: List with the column names from the first CSV row.
        column_name: Name of the column to look up.

    Returns:
        The 0-based index of the first entry equal to ``column_name``.

    Raises:
        ValueError: If ``column_name`` is not present in ``header_entries``.
    """
    try:
        return header_entries.index(column_name)
    except ValueError as e:
        raise ValueError(f"Column name '{column_name}' could not "
                         "be found in the header of the CSV file.") from e


def csv_records(csv_file, delimiter, quote_character,
                header, sequence_column, name_column,
                sequence_column_index, name_column_index):
    """Yield ``(name, sequence)`` tuples for every data row of a CSV file.

    The CSV dialect is sniffed from the first five lines of the file; the
    delimiter and quote character can each be overridden independently.

    Args:
        csv_file: Path of the CSV file to read.
        delimiter: Column delimiter overriding the sniffed one, or a falsy
            value to keep the sniffed delimiter.
        quote_character: Quote character overriding the sniffed one, or a
            falsy value to keep the sniffed quote character.
        header: Whether the first row is a header (skipped as data).
        sequence_column: Header name of the sequence column (only used when
            ``header`` is true), or a falsy value.
        name_column: Header name of the column holding the FASTA names (only
            used when ``header`` is true), or a falsy value.
        sequence_column_index: 0-based index of the sequence column, or None.
        name_column_index: 0-based index of the name column, or None.

    Yields:
        Tuples ``(sequence_name, sequence)``, one per data row.

    Raises:
        ValueError: If a requested column cannot be resolved, both selections
            point at the same column, a row has a different number of columns
            than the first row, or a sequence contains a character that is
            not part of ``iupac``.
    """
    with open(csv_file, newline='') as csvfile:
        # Deduce CSV dialect based on first 5 lines. readline() keeps the
        # line terminators, so the lines are concatenated without adding a
        # separator (which would insert blank lines into the sample).
        hint = "".join(csvfile.readline() for _ in range(5))
        csvfile.seek(0)
        dialect = csv.Sniffer().sniff(hint)
        reader_args = {"dialect": dialect}
        if delimiter:
            reader_args["delimiter"] = delimiter
        # The quote character override must depend on 'quote_character'
        # itself, not on whether a delimiter was provided.
        if quote_character:
            reader_args["quotechar"] = quote_character
        csv_reader = csv.reader(csvfile, **reader_args)
        first_data_line = 1 if header else 0
        for linenum, line in enumerate(csv_reader):
            if linenum == 0:  # First row
                num_columns = len(line)
                if header:
                    if sequence_column:
                        sequence_column_index = resolve_header_name_to_index(
                            line, sequence_column)
                    if name_column:
                        name_column_index = resolve_header_name_to_index(
                            line, name_column)
                    continue
            if linenum == first_data_line:  # First 'data' line
                # Compare against None: index 0 is a valid explicit choice
                # and must not be mistaken for "not specified".
                if (sequence_column_index is None
                        and name_column_index is None
                        and len(line) == 2):
                    name_column_index, sequence_column_index = 0, 1
                # Report missing selections before comparing the two indexes,
                # otherwise "nothing selected" is reported as "same column".
                if sequence_column_index is None:
                    raise ValueError("Either 'sequence_column_index' or 'sequence_column' needs "
                                     "to be specified.")
                if name_column_index is None:
                    raise ValueError("Either 'name_column' or 'name_column_index' needs to "
                                     "be specified.")
                if sequence_column_index == name_column_index:
                    raise ValueError("The same columns were selected for both the FASTQ sequences and "
                                     "headers.")
                if name_column_index >= num_columns:
                    raise ValueError(f"Requested to use column number {name_column_index} "
                                     f"(0 based) for the FASTA headers, but only {num_columns} "
                                     "were found on the first line.")
                if sequence_column_index >= num_columns:
                    raise ValueError(f"Requested to use column number {sequence_column_index} "
                                     f"(0 based) for the FASTA sequences, but only {num_columns} "
                                     "were found on the first line.")
            if len(line) != num_columns:
                raise ValueError(f"Number of columns ({len(line)}) found on line {linenum+1} "
                                 "is different compared to number of columns found "
                                 f"previously ({num_columns}).")
            sequence_name, sequence = line[name_column_index], line[sequence_column_index]
            invalid_characters = set(sequence.upper()) - iupac
            if invalid_characters:
                # Sorted so that the error message is deterministic.
                raise ValueError(f"The sequence ('{sequence}') found on line {linenum+1} "
                                 f"contains characters ({','.join(sorted(invalid_characters))}) "
                                 "which are not valid IUPAC identifiers for nucleotides.")
            yield sequence_name, sequence
|
||
|
||
def main(par):
    """Convert the CSV file described by ``par`` into a FASTA file.

    Args:
        par: Dictionary with the component arguments. The keys read are
            'input', 'output', 'header', 'delimiter', 'quote_character',
            'sequence_column', 'name_column', 'sequence_column_index' and
            'name_column_index'. 'input' and 'output' are replaced by
            ``Path`` objects and 'header' is set to True when a column is
            selected by name.

    Raises:
        ValueError: If a column is selected both by name and by index, or if
            the same index is given for the sequence and the name column.
    """
    par['input'], par['output'] = Path(par['input']), Path(par['output'])
    sequence_column, name_column = par['sequence_column'], par['name_column']
    sequence_column_index = par['sequence_column_index']
    name_column_index = par['name_column_index']
    # Selecting a column by name only makes sense when there is a header.
    if (sequence_column or name_column) and not par['header']:
        par["header"] = True
    # Indexes are compared against None because 0 is a valid column index.
    if sequence_column_index is not None and sequence_column:
        raise ValueError("Cannot specify both 'sequence_column_index' and 'sequence_column'")
    if name_column_index is not None and name_column:
        raise ValueError("Cannot specify both 'name_column_index' and 'name_column'")
    if sequence_column_index is not None and \
            sequence_column_index == name_column_index:
        raise ValueError("The value specified for 'sequence_column_index' cannot be the same as "
                         "the value for 'name_column_index'.")
    with dnaio.open(par['output'], mode='w', fileformat="fasta") as writer:
        for header, sequence in csv_records(par['input'],
                                            par['delimiter'],
                                            par['quote_character'],
                                            par['header'],
                                            sequence_column,
                                            name_column,
                                            sequence_column_index,
                                            name_column_index):
            writer.write(dnaio.SequenceRecord(header, sequence))


if __name__ == "__main__":
    main(par)
Oops, something went wrong.