Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: add csv2fasta #61

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@

* `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).

* `csv2fasta`: Convert two columns from a CSV file to FASTA entries (PR #61).

## MAJOR CHANGES

Expand Down
102 changes: 102 additions & 0 deletions src/sequenceformats/csv2fasta/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name: csv2fasta
description: |
Convert two columns from a CSV file to FASTA entries. The CSV file can
contain an optional header and each row (other than the header) becomes
a single FASTA record. One of the two columns will be used as the names
for the FASTA entries, while the other becomes the sequences. The sequences
column must only contain characters that are valid IUPAC notation for
nucleotides or a group thereof (wildcard characters).
argument_groups:
- name: Inputs
arguments:
- name: --input
type: file
direction: input
example: barcodes.csv
description: CSV file to be processed.
required: true
- name: --header
type: boolean_true
description: |
Parse the first line of the CSV file as a header.
- name: "CSV dialect options"
description: |
Options that can be used to override the automatically detected
dialect of the CSV file.
arguments:
- name: --delimiter
type: string
description: |
Overwrite the column delimiter character.
- name: --quote_character
type: string
description: |
Overwrite the character used to denote the start and end of a quoted item.
- name: "CSV column arguments"
description: |
Parameters for the selection of columns from the CSV file.
Only required when your CSV file contains more than 2 columns,
otherwise the first column will be used for the FASTA header
and the second for the FASTA nucleotide sequences. This default
can still be overwritten by using the options below.
arguments:
- name: --sequence_column
tverbeiren marked this conversation as resolved.
Show resolved Hide resolved
type: string
description: |
Name of the column containing the sequences. Implies 'header'.
Cannot be used together with 'sequence_column_index'.
required: false
- name: "--name_column"
type: string
description: |
Name of the column describing the FASTA headers. Implies 'header'.
Cannot be used together with 'name_column_index'.
required: false
- name: "--sequence_column_index"
type: integer
min: 0
description: |
Index of the column to use as the FASTA sequences, counted from the left and
starting from 0. Cannot be used in combination with the 'sequence_column' argument.
required: false
- name: "--name_column_index"
type: integer
min: 0
description: |
Index of the column to use as the FASTA headers, counted from the left and
starting from 0. Cannot be used in combination with 'name_column'.
required: false
- name: Outputs
arguments:
- name: "--output"
type: file
example: barcodes.fasta
direction: output
description: Output fasta file.

resources:
- type: python_script
path: script.py
test_resources:
- type: python_script
path: test_csv2fasta.py

engines:
- type: docker
image: python:slim
setup:
- type: apt
packages:
- procps
- type: python
packages:
- dnaio
test_setup:
- type: python
packages:
- pytest
- viashpy

runners:
- type: executable
- type: nextflow
102 changes: 102 additions & 0 deletions src/sequenceformats/csv2fasta/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from pathlib import Path
import dnaio
import csv

# Placeholder parameter dictionary, delimited by the VIASH START/END markers.
# main() reads the keys 'input', 'output', 'header', 'delimiter',
# 'quote_character', 'sequence_column', 'name_column',
# 'sequence_column_index' and 'name_column_index' from it.
# NOTE(review): presumably Viash replaces this section with the real argument
# values when the component is built -- confirm against the Viash docs.
## VIASH START
par = {

}
## VIASH END

# Characters accepted in a sequence. csv_records() upper-cases every sequence
# before comparing it against this set, so lower-case input is accepted too.
# NOTE(review): 'X' is not one of the standard IUPAC nucleotide codes --
# confirm that it is allowed on purpose.
iupac = frozenset("ABCDGHKMNRSTUVWXY")

def resolve_header_name_to_index(header_entries, column_name):
    """Return the position of ``column_name`` within the CSV header row.

    Args:
        header_entries: List with the values found on the header line.
        column_name: Name of the column to look up.

    Returns:
        The 0-based index of the first header entry equal to ``column_name``.

    Raises:
        ValueError: When ``column_name`` does not occur in ``header_entries``.
    """
    try:
        position = header_entries.index(column_name)
    except ValueError as lookup_error:
        # Swap the generic list.index() message for one that names the column.
        message = (f"Column name '{column_name}' could not "
                   "be found in the header of the CSV file.")
        raise ValueError(message) from lookup_error
    return position


def csv_records(csv_file, delimiter, quote_character,
                header, sequence_column, name_column,
                sequence_column_index, name_column_index):
    """Yield ``(name, sequence)`` pairs read from the rows of a CSV file.

    The CSV dialect is sniffed from the first 5 lines of the file; a given
    ``delimiter`` and/or ``quote_character`` override the detected values.

    Args:
        csv_file: Path of the CSV file to read.
        delimiter: Column delimiter overriding the detected one, or a falsy
            value to keep the detected delimiter.
        quote_character: Quote character overriding the detected one, or a
            falsy value to keep the detected quote character.
        header: Truthy when the first row is a header that must be skipped.
        sequence_column: Header name of the sequence column (only looked up
            when ``header`` is truthy), or a falsy value.
        name_column: Header name of the column holding the FASTA names (only
            looked up when ``header`` is truthy), or a falsy value.
        sequence_column_index: 0-based index of the sequence column or None.
        name_column_index: 0-based index of the name column or None.

    Yields:
        Tuples ``(sequence_name, sequence)``, one for every data row.

    Raises:
        ValueError: When a column name is missing from the header, when the
            column selection is incomplete, identical or out of range, when a
            row has a deviating number of columns, or when a sequence holds
            characters that are not part of ``iupac``.
        csv.Error: When the dialect cannot be detected and no ``delimiter``
            was provided.
    """
    with open(csv_file, newline='') as csvfile:
        # Deduce CSV dialect based on first 5 lines. readline() keeps the line
        # terminator, so the lines are concatenated without a separator.
        hint = "".join(csvfile.readline() for _ in range(5))
        csvfile.seek(0)
        try:
            dialect = csv.Sniffer().sniff(hint)
        except csv.Error:
            # Without an explicit delimiter there is nothing to fall back on.
            if not delimiter:
                raise
            dialect = csv.excel
        reader_args = {"dialect": dialect}
        if delimiter:
            reader_args["delimiter"] = delimiter
        # Each override is applied independently of the other one.
        if quote_character:
            reader_args["quotechar"] = quote_character
        csv_reader = csv.reader(csvfile, **reader_args)

        has_header = bool(header)
        first_data_line = int(has_header)
        num_columns = None
        for linenum, line in enumerate(csv_reader):
            if linenum == 0:  # First row
                num_columns = len(line)
                if has_header:
                    if sequence_column:
                        sequence_column_index = resolve_header_name_to_index(
                            line, sequence_column)
                    if name_column:
                        name_column_index = resolve_header_name_to_index(
                            line, name_column)
                    continue
            if linenum == first_data_line:  # First 'data' line
                # Compare against None: index 0 is a valid, explicit choice.
                if len(line) == 2:
                    if sequence_column_index is None and name_column_index is None:
                        name_column_index, sequence_column_index = 0, 1
                    elif sequence_column_index is None and name_column_index in (0, 1):
                        # With two columns the remaining one holds the sequences.
                        sequence_column_index = 1 - name_column_index
                    elif name_column_index is None and sequence_column_index in (0, 1):
                        name_column_index = 1 - sequence_column_index
                # Report a missing selection before comparing the indices, so
                # that "nothing selected" is not reported as "same columns".
                if sequence_column_index is None:
                    raise ValueError("Either 'sequence_column_index' or 'sequence_column' needs "
                                     "to be specified.")
                if name_column_index is None:
                    raise ValueError("Either 'name_column' or 'name_column_index' needs to "
                                     "be specified.")
                if sequence_column_index == name_column_index:
                    raise ValueError("The same columns were selected for both the FASTA "
                                     "sequences and headers.")
                if name_column_index >= num_columns:
                    raise ValueError(f"Requested to use column number {name_column_index} "
                                     f"(0 based) for the FASTA headers, but only {num_columns} "
                                     "were found on the first line.")
                if sequence_column_index >= num_columns:
                    raise ValueError(f"Requested to use column number {sequence_column_index} "
                                     f"(0 based) for the FASTA sequences, but only {num_columns} "
                                     "were found on the first line.")
            if len(line) != num_columns:
                raise ValueError(f"Number of columns ({len(line)}) found on line {linenum+1} "
                                 "is different compared to number of columns found "
                                 f"previously ({num_columns}).")
            sequence_name, sequence = line[name_column_index], line[sequence_column_index]
            invalid_characters = set(sequence.upper()) - iupac
            if invalid_characters:
                # Sorted so that the error message is deterministic.
                raise ValueError(f"The sequence ('{sequence}') found on line {linenum+1} "
                                 f"contains characters ({','.join(sorted(invalid_characters))}) "
                                 "which are not valid IUPAC identifiers for nucleotides.")
            yield sequence_name, sequence


def main(par):
    """Validate the parameters and write the CSV rows as FASTA records.

    Args:
        par: Dictionary with the keys 'input', 'output', 'header',
            'delimiter', 'quote_character', 'sequence_column', 'name_column',
            'sequence_column_index' and 'name_column_index'. The 'input' and
            'output' entries are replaced by ``Path`` objects and 'header' is
            set to True when a column is selected by name.

    Raises:
        ValueError: When a column is selected both by name and by index, or
            when the same index is selected for the names and the sequences.
    """
    par['input'], par['output'] = Path(par['input']), Path(par['output'])
    sequence_column, name_column = par['sequence_column'], par['name_column']
    sequence_column_index = par['sequence_column_index']
    name_column_index = par['name_column_index']
    # Selecting a column by name only makes sense when there is a header.
    if sequence_column or name_column:
        par["header"] = True
    # Compare the indices against None: 0 is a valid column index.
    if sequence_column_index is not None and sequence_column:
        raise ValueError("Cannot specify both 'sequence_column_index' and 'sequence_column'")
    if name_column_index is not None and name_column:
        raise ValueError("Cannot specify both 'name_column_index' and 'name_column'")
    if sequence_column_index is not None and sequence_column_index == name_column_index:
        raise ValueError("The value specified for 'sequence_column_index' cannot be the same as "
                         "the value for 'name_column_index'.")
    with dnaio.open(par['output'], mode='w', fileformat="fasta") as writer:
        records = csv_records(par['input'],
                              par['delimiter'],
                              par['quote_character'],
                              par['header'],
                              sequence_column,
                              name_column,
                              sequence_column_index,
                              name_column_index)
        for header, sequence in records:
            writer.write(dnaio.SequenceRecord(header, sequence))

# Run the conversion only when this file is executed as a script, passing the
# module-level `par` dictionary as the parameters.
if __name__ == "__main__":
    main(par)
Loading