Skip to content

Commit

Permalink
Add sniffer
Browse files Browse the repository at this point in the history
  • Loading branch information
DriesSchaumont committed Jun 12, 2024
1 parent bd03322 commit 8b1b8bd
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 12 deletions.
19 changes: 10 additions & 9 deletions src/sequenceformats/csv2fasta/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,23 @@ argument_groups:
example: barcodes.csv
description: CSV file to be processed.
required: true
- name: "CSV Format arguments"
- name: --header
type: boolean_true
description: |
Parse the first line of the CSV file as a header.
- name: "CSV dialect options"
description: |
Options that can be used to override the automatically detected
dialect of the CSV file.
arguments:
- name: --header
type: boolean_true
description: |
Parse the first line of the CSV file as a header.
- name: --delimiter
type: string
description: |
Column delimiter
default: ","
Override the automatically detected column delimiter character.
- name: --quote_character
type: string
description: |
Character used to denote the start and end of a quoted item.
default: '"'
Override the automatically detected character used to denote the start and end of a quoted item.
- name: "CSV column arguments"
description: |
Parameters for the selection of columns from the CSV file.
Expand Down
12 changes: 9 additions & 3 deletions src/sequenceformats/csv2fasta/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,15 @@ def csv_records(csv_file, delimiter, quote_character,
header, sequence_column, name_column,
sequence_column_index, name_column_index):
with open(csv_file, newline='') as csvfile:
csv_reader = csv.reader(csvfile,
delimiter=delimiter,
quotechar=quote_character)
# Deduce the CSV dialect from (at most) the first 5 lines of the file.
# readline() keeps each line terminator (and returns "" past end-of-file),
# so the lines are concatenated as-is to give the sniffer an exact copy of
# the head of the file rather than one with extra blank lines in between.
hint = "".join(csvfile.readline() for _ in range(5))
# Rewind so the reader created below starts at the first row again.
csvfile.seek(0)
dialect = csv.Sniffer().sniff(hint)
all_args = {"dialect": dialect}
# Explicitly provided options take precedence over the sniffed dialect.
# Each override is gated on its own argument: a delimiter given on its own
# must not push an unset quote character into csv.reader, and a quote
# character given on its own must not be ignored.
if delimiter:
    all_args["delimiter"] = delimiter
if quote_character:
    all_args["quotechar"] = quote_character
csv_reader = csv.reader(csvfile, **all_args)
for linenum, line in enumerate(csv_reader):
if not linenum: # First row
num_columns = len(line)
Expand Down
61 changes: 61 additions & 0 deletions src/sequenceformats/csv2fasta/test_csv2fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,65 @@ def test_csvtofasta_header_select_column_by_both_name_and_index(run_component, r
output_contents = open_output.read()
assert output_contents == expected

def test_csvtofasta_autodetect_dialect(run_component, random_path):
    """Check that tab-separated input converts without dialect arguments.

    Two inputs are run with identical arguments: one with bare fields and
    one where every field is wrapped in double quotes. Neither run passes
    ``--delimiter`` or ``--quote_character``; both must produce the same
    FASTA output.
    """
    unquoted_input = dedent("""\
        barcode_name\tsome_other_column\tsequence
        barcode1\tfoo\tACGT
        barcode2\tbar\tTTTA
        """)
    quoted_input = dedent("""\
        "barcode_name"\t"some_other_column"\t"sequence"
        "barcode1"\t"foo"\t"ACGT"
        "barcode2"\t"bar"\t"TTTA"
        """)
    expected = dedent("""\
        >barcode1
        ACGT
        >barcode2
        TTTA
        """)

    # Unquoted scenario first, then the quoted one.
    for csv_contents in (unquoted_input, quoted_input):
        input_path = random_path("csv")
        input_path.write_text(csv_contents)
        output_path = random_path("csv")
        component_args = [
            "--input", input_path,
            "--output", output_path,
            "--header",
            "--name_column", "barcode_name",
            "--sequence_column_index", "2",
        ]
        run_component(component_args)
        assert output_path.is_file()
        output_contents = output_path.read_text()
        assert output_contents == expected

def test_csvtofasta_header_select_column_by_name(run_component, random_path):
csv_contents = dedent("""\
barcode_name,some_other_column,sequence
Expand Down Expand Up @@ -301,5 +360,7 @@ def test_csvtofasta_2_columns_but_not_valid_sequence(run_component, random_path)
r"IUPAC identifiers for nucleotides\.",
err.value.stdout.decode('utf-8'))



if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

0 comments on commit 8b1b8bd

Please sign in to comment.