diff --git a/CHANGELOG.md b/CHANGELOG.md index 99c859d2..23b37990 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,7 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). +* `csv2fasta`: Convert two columns from a CSV file to FASTA entries (PR #61). ## MAJOR CHANGES diff --git a/src/sequenceformats/csv2fasta/config.vsh.yaml b/src/sequenceformats/csv2fasta/config.vsh.yaml new file mode 100644 index 00000000..c49f73ea --- /dev/null +++ b/src/sequenceformats/csv2fasta/config.vsh.yaml @@ -0,0 +1,102 @@ +name: csv2fasta +description: | + Convert two columns from a CSV file to FASTA entries. The CSV file can + contain an optional header and each row (other than the header) becomes + a single FASTA record. One of the two columns will be used as the names + for the FASTA entries, while the other becomes the sequences. The sequences + column must only contain characters that are valid IUPAC notation for + nucleotides or a group thereof (wildcard characters). +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + example: barcodes.csv + description: CSV file to be processed. + required: true + - name: --header + type: boolean_true + description: | + Parse the first line of the CSV file as a header. + - name: "CSV dialect options" + description: | + Options that can be used to override the automatically detected + dialect of the CSV file. + arguments: + - name: --delimiter + type: string + description: | + Override the column delimiter character. + - name: --quote_character + type: string + description: | + Override the character used to denote the start and end of a quoted item. + - name: "CSV column arguments" + description: | + Parameters for the selection of columns from the CSV file. + Only required when your CSV file contains more than 2 columns, + otherwise the first column will be used for the FASTA header + and the second for the FASTA nucleotide sequences. 
This default + can still be overridden by using the options below. + arguments: + - name: --sequence_column + type: string + description: | + Name of the column containing the sequences. Implies 'header'. + Cannot be used together with 'sequence_column_index'. + required: false + - name: "--name_column" + type: string + description: | + Name of the column describing the FASTA headers. Implies 'header'. + Cannot be used together with 'name_column_index'. + required: false + - name: "--sequence_column_index" + type: integer + min: 0 + description: | + Index of the column to use as the FASTA sequences, counted from the left and + starting from 0. Cannot be used in combination with the 'sequence_column' argument. + required: false + - name: "--name_column_index" + type: integer + min: 0 + description: | + Index of the column to use as the FASTA headers, counted from the left and + starting from 0. Cannot be used in combination with 'name_column'. + required: false + - name: Outputs + arguments: + - name: "--output" + type: file + example: barcodes.fasta + direction: output + description: Output fasta file. 
# Upper-case characters accepted in the sequence column. Sequences are
# upper-cased before the comparison, so lower-case input is accepted too.
iupac = frozenset("ABCDGHKMNRSTUVWXY")


def resolve_header_name_to_index(header_entries, column_name):
    """Return the 0-based position of ``column_name`` in the CSV header row.

    Args:
        header_entries: List of column names parsed from the first CSV row.
        column_name: Name of the column to look up.

    Returns:
        Index of the first entry in ``header_entries`` equal to ``column_name``.

    Raises:
        ValueError: If ``column_name`` does not occur in ``header_entries``.
    """
    try:
        return header_entries.index(column_name)
    except ValueError as e:
        raise ValueError(f"Column name '{column_name}' could not "
                         "be found in the header of the CSV file.") from e


def _resolve_column_indices(row_width, num_columns,
                            name_column_index, sequence_column_index):
    """Apply the two-column defaults and validate the selected column indices.

    Args:
        row_width: Number of fields on the first data row.
        num_columns: Number of fields on the first row of the file.
        name_column_index: Requested index for the FASTA headers, or None.
        sequence_column_index: Requested index for the sequences, or None.

    Returns:
        Tuple ``(name_column_index, sequence_column_index)`` with both set.

    Raises:
        ValueError: If a column is still unspecified, if both indices are
            equal, or if an index is not smaller than ``num_columns``.
    """
    # Compare against None explicitly: 0 is a valid column index and must not
    # be mistaken for "not provided".
    if row_width == 2:
        if name_column_index is None and sequence_column_index is None:
            name_column_index, sequence_column_index = 0, 1
        elif name_column_index is None and sequence_column_index in (0, 1):
            # Only one column is left, so it must hold the names.
            name_column_index = 1 - sequence_column_index
        elif sequence_column_index is None and name_column_index in (0, 1):
            sequence_column_index = 1 - name_column_index
    # Report missing selections before comparing the indices, otherwise two
    # unset (None) values would be reported as "the same column".
    if sequence_column_index is None:
        raise ValueError("Either 'sequence_column_index' or 'sequence_column' needs "
                         "to be specified.")
    if name_column_index is None:
        raise ValueError("Either 'name_column' or 'name_column_index' needs to "
                         "be specified.")
    if sequence_column_index == name_column_index:
        raise ValueError("The same columns were selected for both the FASTA sequences and "
                         "headers.")
    if name_column_index >= num_columns:
        raise ValueError(f"Requested to use column number {name_column_index} "
                         f"(0 based) for the FASTA headers, but only {num_columns} "
                         "were found on the first line.")
    if sequence_column_index >= num_columns:
        raise ValueError(f"Requested to use column number {sequence_column_index} "
                         f"(0 based) for the FASTA sequences, but only {num_columns} "
                         "were found on the first line.")
    return name_column_index, sequence_column_index


def csv_records(csv_file, delimiter, quote_character,
                header, sequence_column, name_column,
                sequence_column_index, name_column_index):
    """Yield ``(name, sequence)`` pairs read from a CSV file.

    The CSV dialect is sniffed from the first five lines of the file;
    ``delimiter`` and ``quote_character`` override the sniffed values
    independently of each other when they are set.

    Args:
        csv_file: Path of the CSV file to read.
        delimiter: Column delimiter overriding the sniffed one, or None/empty.
        quote_character: Quote character overriding the sniffed one, or
            None/empty.
        header: Truthy when the first row is a header that must be skipped.
        sequence_column: Header name of the sequence column, or None. Only
            resolved when ``header`` is truthy.
        name_column: Header name of the FASTA header column, or None. Only
            resolved when ``header`` is truthy.
        sequence_column_index: 0-based index of the sequence column, or None.
        name_column_index: 0-based index of the FASTA header column, or None.

    Yields:
        Tuples ``(sequence_name, sequence)``, one per data row.

    Raises:
        ValueError: If a column cannot be resolved or is out of bounds, if
            both selections point to the same column, if a row has a different
            number of fields than the first row, or if a sequence contains
            characters outside ``iupac``.
        csv.Error: Propagated from ``csv.Sniffer.sniff`` when the dialect
            cannot be deduced.
    """
    with open(csv_file, newline='') as csvfile:
        # Deduce CSV dialect based on first 5 lines. readline() keeps the line
        # terminators, so the lines are concatenated as-is.
        hint = "".join(csvfile.readline() for _ in range(5))
        csvfile.seek(0)
        reader_args = {"dialect": csv.Sniffer().sniff(hint)}
        # Each override is applied only when it was actually provided.
        if delimiter:
            reader_args["delimiter"] = delimiter
        if quote_character:
            reader_args["quotechar"] = quote_character
        first_data_line = 1 if header else 0
        num_columns = None
        for linenum, line in enumerate(csv.reader(csvfile, **reader_args)):
            if linenum == 0:
                num_columns = len(line)
                if header:
                    if sequence_column:
                        sequence_column_index = resolve_header_name_to_index(line, sequence_column)
                    if name_column:
                        name_column_index = resolve_header_name_to_index(line, name_column)
                    continue
            if linenum == first_data_line:
                # The column selection is settled once, on the first data row.
                name_column_index, sequence_column_index = _resolve_column_indices(
                    len(line), num_columns, name_column_index, sequence_column_index)
            if len(line) != num_columns:
                raise ValueError(f"Number of columns ({len(line)}) found on line {linenum+1} "
                                 "is different compared to number of columns found "
                                 f"previously ({num_columns}).")
            sequence_name, sequence = line[name_column_index], line[sequence_column_index]
            invalid_characters = set(sequence.upper()) - iupac
            if invalid_characters:
                # Sorted so that the message is deterministic.
                raise ValueError(f"The sequence ('{sequence}') found on line {linenum+1} "
                                 f"contains characters ({','.join(sorted(invalid_characters))}) "
                                 "which are not valid IUPAC identifiers for nucleotides.")
            yield sequence_name, sequence


def main(par):
    """Convert the CSV file in ``par['input']`` to FASTA at ``par['output']``.

    Args:
        par: Mapping with the keys 'input', 'output', 'header', 'delimiter',
            'quote_character', 'sequence_column', 'name_column',
            'sequence_column_index' and 'name_column_index'. 'input', 'output'
            and possibly 'header' are updated in place.

    Raises:
        ValueError: If a column is selected both by name and by index, if both
            index arguments have the same value, or for any problem reported
            by ``csv_records``.
    """
    par['input'], par['output'] = Path(par['input']), Path(par['output'])
    sequence_column, name_column = par['sequence_column'], par['name_column']
    sequence_column_index, name_column_index = par['sequence_column_index'], par['name_column_index']
    # Selecting a column by name implies that the file has a header.
    if sequence_column or name_column:
        par["header"] = True
    # Indices are compared with None because 0 is a valid (but falsy) index.
    if sequence_column_index is not None and sequence_column:
        raise ValueError("Cannot specify both 'sequence_column_index' and 'sequence_column'")
    if name_column_index is not None and name_column:
        raise ValueError("Cannot specify both 'name_column_index' and 'name_column'")
    if sequence_column_index is not None and sequence_column_index == name_column_index:
        raise ValueError("The value specified for 'sequence_column_index' cannot be the same as "
                         "the value for 'name_column_index'.")
    with dnaio.open(par['output'], mode='w', fileformat="fasta") as writer:
        for header, sequence in csv_records(par['input'],
                                            par['delimiter'],
                                            par['quote_character'],
                                            par['header'],
                                            sequence_column,
                                            name_column,
                                            sequence_column_index,
                                            name_column_index):
            writer.write(dnaio.SequenceRecord(header, sequence))


if __name__ == "__main__":
    main(par)
# CSV with a header row and three columns, shared by most scenarios below.
_THREE_COLUMN_CSV = dedent("""\
    barcode_name,some_other_column,sequence
    barcode1,foo,ACGT
    barcode2,bar,TTTA
    """)

# FASTA text every successful conversion in this module is compared against.
_EXPECTED_FASTA = dedent("""\
    >barcode1
    ACGT
    >barcode2
    TTTA
    """)


@pytest.fixture
def random_path(tmp_path):
    """Provide a factory returning unique paths located inside ``tmp_path``."""
    def wrapper(extension=None):
        suffix = f".{extension}" if extension else ""
        return tmp_path / f"{uuid4()}{suffix}"
    return wrapper


def _prepare_paths(random_path, csv_text):
    """Write ``csv_text`` to a new input file and return (input, output) paths."""
    source = random_path("csv")
    with source.open('w') as handle:
        handle.write(csv_text)
    return source, random_path("csv")


def _failure_output(run_component, cli_args):
    """Run the component expecting CalledProcessError; return its decoded stdout."""
    with pytest.raises(CalledProcessError) as failure:
        run_component(cli_args)
    return failure.value.stdout.decode('utf-8')


def _fasta_output(run_component, cli_args, target):
    """Run the component, check that ``target`` exists and return its contents."""
    run_component(cli_args)
    assert target.is_file()
    with target.open('r') as handle:
        return handle.read()


@pytest.mark.parametrize("arg,val,expected_err", [
    ("name_column", "barcode_name", ("sequence_column_index", "sequence_column")),
    ("sequence_column", "sequence", ("name_column", "name_column_index")),
])
def test_csvtofasta_no_columns_selected_raises(run_component, random_path, arg, val, expected_err):
    """Selecting only one column of a three-column file reports the missing one."""
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    cli_args = ["--input", source, "--output", target, "--header", f"--{arg}", val]
    first_option, second_option = expected_err
    message = f"ValueError: Either '{first_option}' or '{second_option}' needs to be specified."
    assert message in _failure_output(run_component, cli_args)


def test_csvtofasta_column_does_not_exist_raises(run_component, random_path):
    """A column name that is absent from the header is reported."""
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    stdout = _failure_output(run_component, [
        "--input", source,
        "--output", target,
        "--sequence_column", "foo",
    ])
    message = "ValueError: Column name 'foo' could not be found in the " + \
        "header of the CSV file."
    assert message in stdout


def test_csvtofasta_same_column_selected_raises(run_component, random_path):
    """Passing the same index for names and sequences is rejected."""
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    stdout = _failure_output(run_component, [
        "--input", source,
        "--output", target,
        "--sequence_column_index", "1",
        "--name_column_index", "1",
    ])
    message = "ValueError: The value specified for 'sequence_column_index' cannot " + \
        "be the same as the value for 'name_column_index'"
    assert message in stdout


@pytest.mark.parametrize("arg,val,expected_err", [
    ("sequence_column_index", "3", "sequences"),
    ("name_column_index", "4", "headers"),
])
def test_csvtofasta_header_select_index_out_of_bounds_raises(run_component, random_path, arg, val, expected_err):
    """An index beyond the last column of a three-column file is rejected."""
    # In-range selection for whichever column is not under test.
    companion_args = {
        "sequence_column_index": ["--name_column_index", "1"],
        "name_column_index": ["--sequence_column_index", "2"],
    }
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    cli_args = ["--input", source, "--output", target, "--header",
                f"--{arg}", val, *companion_args[arg]]
    message = f"ValueError: Requested to use column number {val} (0 based) for the FASTA " + \
        f"{expected_err}, but only 3 were found on the first line."
    assert message in _failure_output(run_component, cli_args)


def test_csvtofasta_header_select_column_by_both_name_and_index(run_component, random_path):
    """Names selected by header name and sequences by index can be combined."""
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    produced = _fasta_output(run_component, [
        "--input", source,
        "--output", target,
        "--header",
        "--name_column", "barcode_name",
        "--sequence_column_index", "2",
    ], target)
    assert produced == _EXPECTED_FASTA


def test_csvtofasta_autodetect_dialect(run_component, random_path):
    """Tab-separated input, unquoted and then quoted, converts without dialect options."""
    unquoted_tsv = dedent("""\
        barcode_name\tsome_other_column\tsequence
        barcode1\tfoo\tACGT
        barcode2\tbar\tTTTA
        """)
    quoted_tsv = dedent("""\
        "barcode_name"\t"some_other_column"\t"sequence"
        "barcode1"\t"foo"\t"ACGT"
        "barcode2"\t"bar"\t"TTTA"
        """)
    for csv_text in (unquoted_tsv, quoted_tsv):
        source, target = _prepare_paths(random_path, csv_text)
        produced = _fasta_output(run_component, [
            "--input", source,
            "--output", target,
            "--header",
            "--name_column", "barcode_name",
            "--sequence_column_index", "2",
        ], target)
        assert produced == _EXPECTED_FASTA


def test_csvtofasta_header_select_column_by_name(run_component, random_path):
    """Both columns can be selected through their header names."""
    source, target = _prepare_paths(random_path, _THREE_COLUMN_CSV)
    produced = _fasta_output(run_component, [
        "--input", source,
        "--output", target,
        "--header",
        "--name_column", "barcode_name",
        "--sequence_column", "sequence",
    ], target)
    assert produced == _EXPECTED_FASTA


def test_csvtofasta_header_2_columns(run_component, random_path):
    """A two-column file with a header needs no column selection."""
    source, target = _prepare_paths(random_path, dedent("""\
        barcode_name,sequence
        barcode1,ACGT
        barcode2,TTTA
        """))
    produced = _fasta_output(run_component, [
        "--input", source,
        "--output", target,
        "--header",
    ], target)
    assert produced == _EXPECTED_FASTA


def test_csvtofasta_2_columns(run_component, random_path):
    """A two-column file without a header needs no options besides input/output."""
    source, target = _prepare_paths(random_path, dedent("""\
        barcode1,ACGT
        barcode2,TTTA
        """))
    produced = _fasta_output(run_component, [
        "--input", source,
        "--output", target,
    ], target)
    assert produced == _EXPECTED_FASTA


def test_csvtofasta_2_columns_but_still_swap(run_component, random_path):
    """Explicit indices can swap the columns of a two-column file."""
    source, target = _prepare_paths(random_path, dedent("""\
        ACGT,barcode1
        TTTA,barcode2
        """))
    produced = _fasta_output(run_component, [
        "--input", source,
        "--output", target,
        "--sequence_column_index", "0",
        "--name_column_index", "1",
    ], target)
    assert produced == _EXPECTED_FASTA


def test_csvtofasta_2_columns_but_not_valid_sequence(run_component, random_path):
    """Sequences with characters outside the accepted set are reported with their line."""
    # Without --header the header row itself is checked as a sequence.
    source, target = _prepare_paths(random_path, dedent("""\
        barcodes,sequences
        barcode1,ACGT
        barcode2,TTTA
        """))
    stdout = _failure_output(run_component, [
        "--input", source,
        "--output", target,
    ])
    assert re.search(r"ValueError: The sequence \('sequences'\) found on line "
                     r"1 contains characters \(.+\) which are not valid "
                     r"IUPAC identifiers for nucleotides\.",
                     stdout)

    # With --header the invalid character sits in a data row.
    source, target = _prepare_paths(random_path, dedent("""\
        barcodes,sequences
        barcode1,ACGT
        barcode2,TTEA
        """))
    stdout = _failure_output(run_component, [
        "--input", source,
        "--output", target,
        "--header",
    ])
    assert re.search(r"ValueError: The sequence \('TTEA'\) found on line "
                     r"3 contains characters \(E\) which are not valid "
                     r"IUPAC identifiers for nucleotides\.",
                     stdout)


if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))