From 8b1b8bdc06ac508452de4cf8afec6d53eceb31dd Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Wed, 12 Jun 2024 07:53:30 +0000
Subject: [PATCH] Add sniffer

---
 src/sequenceformats/csv2fasta/config.vsh.yaml | 19 +++---
 src/sequenceformats/csv2fasta/script.py       | 12 +++-
 .../csv2fasta/test_csv2fasta.py               | 61 +++++++++++++++++++
 3 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/src/sequenceformats/csv2fasta/config.vsh.yaml b/src/sequenceformats/csv2fasta/config.vsh.yaml
index 46dc660f..c49f73ea 100644
--- a/src/sequenceformats/csv2fasta/config.vsh.yaml
+++ b/src/sequenceformats/csv2fasta/config.vsh.yaml
@@ -15,22 +15,23 @@ argument_groups:
         example: barcodes.csv
         description: CSV file to be processed.
         required: true
-  - name: "CSV Format arguments"
+      - name: --header
+        type: boolean_true
+        description: |
+          Parse the first line of the CSV file as a header.
+  - name: "CSV dialect options"
+    description: |
+      Options that can be used to override the automatically detected
+      dialect of the CSV file.
     arguments:
-      - name: --header
-        type: boolean_true
-        description: |
-          Parse the first line of the CSV file as a header.
       - name: --delimiter
         type: string
         description: |
-          Column delimiter
-        default: ","
+          Override the column delimiter character.
       - name: --quote_character
         type: string
         description: |
-          Character used to denote the start and end of a quoted item.
-        default: '"'
+          Override the character used to denote the start and end of a quoted item.
   - name: "CSV column arguments"
     description: |
       Parameters for the selection of columns from the CSV file.
diff --git a/src/sequenceformats/csv2fasta/script.py b/src/sequenceformats/csv2fasta/script.py
index 3317137a..b3c7ae89 100644
--- a/src/sequenceformats/csv2fasta/script.py
+++ b/src/sequenceformats/csv2fasta/script.py
@@ -22,9 +22,15 @@ def csv_records(csv_file, delimiter, quote_character,
                 header, sequence_column, name_column,
                 sequence_column_index, name_column_index):
     with open(csv_file, newline='') as csvfile:
-        csv_reader = csv.reader(csvfile,
-                                delimiter=delimiter,
-                                quotechar=quote_character)
+        # Sniff the dialect from the first 5 lines; explicit arguments override it.
+        hint = "".join([csvfile.readline() for _ in range(5)])
+        csvfile.seek(0)
+        dialect = csv.Sniffer().sniff(hint)
+        reader_args = {"dialect": dialect}
+        delimiter_arg = {"delimiter": delimiter} if delimiter else {}
+        quotechar_arg = {"quotechar": quote_character} if quote_character else {}
+        all_args = reader_args | delimiter_arg | quotechar_arg
+        csv_reader = csv.reader(csvfile, **all_args)
         for linenum, line in enumerate(csv_reader):
             if not linenum:  # First row
                 num_columns = len(line)
diff --git a/src/sequenceformats/csv2fasta/test_csv2fasta.py b/src/sequenceformats/csv2fasta/test_csv2fasta.py
index 244e28d2..30f6059e 100644
--- a/src/sequenceformats/csv2fasta/test_csv2fasta.py
+++ b/src/sequenceformats/csv2fasta/test_csv2fasta.py
@@ -149,6 +149,65 @@ def test_csvtofasta_header_select_column_by_both_name_and_index(run_component, r
         output_contents = open_output.read()
     assert output_contents == expected
 
+def test_csvtofasta_autodetect_dialect(run_component, random_path):
+    csv_contents = dedent("""\
+    barcode_name\tsome_other_column\tsequence
+    barcode1\tfoo\tACGT
+    barcode2\tbar\tTTTA
+    """)
+
+    expected = dedent("""\
+    >barcode1
+    ACGT
+    >barcode2
+    TTTA
+    """)
+    input_path = random_path("csv")
+    with input_path.open('w') as open_input:
+        open_input.write(csv_contents)
+    output_path = random_path("csv")
+    run_component([
+        "--input", input_path,
+        "--output", output_path,
+        "--header",
+        "--name_column", "barcode_name",
+        "--sequence_column_index", "2",
+        ]
+    )
+    assert output_path.is_file()
+    with output_path.open('r') as open_output:
+        output_contents = open_output.read()
+    assert output_contents == expected
+
+    csv_contents = dedent("""\
+    "barcode_name"\t"some_other_column"\t"sequence"
+    "barcode1"\t"foo"\t"ACGT"
+    "barcode2"\t"bar"\t"TTTA"
+    """)
+
+    expected = dedent("""\
+    >barcode1
+    ACGT
+    >barcode2
+    TTTA
+    """)
+    input_path = random_path("csv")
+    with input_path.open('w') as open_input:
+        open_input.write(csv_contents)
+    output_path = random_path("csv")
+    run_component([
+        "--input", input_path,
+        "--output", output_path,
+        "--header",
+        "--name_column", "barcode_name",
+        "--sequence_column_index", "2",
+        ]
+    )
+    assert output_path.is_file()
+    with output_path.open('r') as open_output:
+        output_contents = open_output.read()
+    assert output_contents == expected
+
 def test_csvtofasta_header_select_column_by_name(run_component, random_path):
     csv_contents = dedent("""\
     barcode_name,some_other_column,sequence
@@ -301,5 +360,7 @@ def test_csvtofasta_2_columns_but_not_valid_sequence(run_component, random_path)
                      r"IUPAC identifiers for nucleotides\.",
                      err.value.stdout.decode('utf-8'))
 
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__]))
\ No newline at end of file