From 8b1b8bdc06ac508452de4cf8afec6d53eceb31dd Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Wed, 12 Jun 2024 07:53:30 +0000
Subject: [PATCH] Add sniffer

---
 src/sequenceformats/csv2fasta/config.vsh.yaml | 19 +++---
 src/sequenceformats/csv2fasta/script.py       | 12 +++-
 .../csv2fasta/test_csv2fasta.py               | 61 +++++++++++++++++++
 3 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/src/sequenceformats/csv2fasta/config.vsh.yaml b/src/sequenceformats/csv2fasta/config.vsh.yaml
index 46dc660f..c49f73ea 100644
--- a/src/sequenceformats/csv2fasta/config.vsh.yaml
+++ b/src/sequenceformats/csv2fasta/config.vsh.yaml
@@ -15,22 +15,23 @@ argument_groups:
       example: barcodes.csv
       description: CSV file to be processed.
       required: true
-  - name: "CSV Format arguments"
+    - name: --header
+      type: boolean_true
+      description: |
+        Parse the first line of the CSV file as a header.
+  - name: "CSV dialect options"
+    description: |
+      Options that can be used to override the automatically detected
+      dialect of the CSV file.
     arguments:
-      - name: --header
-        type: boolean_true
-        description: |
-          Parse the first line of the CSV file as a header.
       - name: --delimiter
         type: string
         description: |
-          Column delimiter
-        default: ","
+          Override the column delimiter character.
       - name: --quote_character
         type: string
         description: |
-          Character used to denote the start and end of a quoted item.
-        default: '"'
+          Override the character used to denote the start and end of a quoted item.
   - name: "CSV column arguments"
     description: |
       Parameters for the selection of columns from the CSV file.
diff --git a/src/sequenceformats/csv2fasta/script.py b/src/sequenceformats/csv2fasta/script.py
index 3317137a..b3c7ae89 100644
--- a/src/sequenceformats/csv2fasta/script.py
+++ b/src/sequenceformats/csv2fasta/script.py
@@ -22,9 +22,15 @@ def csv_records(csv_file, delimiter, quote_character,
                 header, sequence_column, name_column,
                 sequence_column_index, name_column_index):
     with open(csv_file, newline='') as csvfile:
-        csv_reader = csv.reader(csvfile,
-                                delimiter=delimiter,
-                                quotechar=quote_character)
+        # Sniff the dialect from the first 5 lines; explicit options override it.
+        hint = "".join(csvfile.readline() for _ in range(5))  # lines keep their EOL
+        csvfile.seek(0)  # rewind so the reader also sees the sampled lines
+        dialect = csv.Sniffer().sniff(hint)
+        reader_args = {"dialect": dialect}
+        delimiter_arg = {"delimiter": delimiter} if delimiter else {}
+        quotechar_arg = {"quotechar": quote_character} if quote_character else {}
+        all_args = reader_args | delimiter_arg | quotechar_arg
+        csv_reader = csv.reader(csvfile, **all_args)
         for linenum, line in enumerate(csv_reader):
             if not linenum: # First row
                 num_columns = len(line)
diff --git a/src/sequenceformats/csv2fasta/test_csv2fasta.py b/src/sequenceformats/csv2fasta/test_csv2fasta.py
index 244e28d2..30f6059e 100644
--- a/src/sequenceformats/csv2fasta/test_csv2fasta.py
+++ b/src/sequenceformats/csv2fasta/test_csv2fasta.py
@@ -149,6 +149,65 @@ def test_csvtofasta_header_select_column_by_both_name_and_index(run_component, r
         output_contents = open_output.read()
     assert output_contents == expected
 
+def test_csvtofasta_autodetect_dialect(run_component, random_path):  # two tab-delimited inputs, no dialect options passed
+    csv_contents = dedent("""\
+    barcode_name\tsome_other_column\tsequence
+    barcode1\tfoo\tACGT
+    barcode2\tbar\tTTTA
+    """)  # Scenario 1: bare (unquoted) fields.
+
+    expected= dedent("""\
+    >barcode1
+    ACGT
+    >barcode2
+    TTTA
+    """)  # Only names and sequences; "some_other_column" values are absent.
+    input_path = random_path("csv")
+    with input_path.open('w') as open_input:
+        open_input.write(csv_contents)
+    output_path = random_path("csv")
+    run_component([  # Neither --delimiter nor --quote_character is given here.
+        "--input", input_path,
+        "--output", output_path,
+        "--header",  # first CSV line holds the column names
+        "--name_column", "barcode_name",
+        "--sequence_column_index", "2",  # presumably zero-based: "sequence" is the third column
+        ]
+    )
+    assert output_path.is_file()
+    with output_path.open('r') as open_output:
+        output_contents = open_output.read()
+    assert output_contents == expected
+    # Scenario 2: same data, but every field is wrapped in double quotes.
+    csv_contents = dedent("""\
+    "barcode_name"\t"some_other_column"\t"sequence"
+    "barcode1"\t"foo"\t"ACGT"
+    "barcode2"\t"bar"\t"TTTA"
+    """)
+
+    expected= dedent("""\
+    >barcode1
+    ACGT
+    >barcode2
+    TTTA
+    """)  # Same expected FASTA as scenario 1: the quotes must be stripped.
+    input_path = random_path("csv")
+    with input_path.open('w') as open_input:
+        open_input.write(csv_contents)
+    output_path = random_path("csv")
+    run_component([  # Same arguments as scenario 1; again no dialect options.
+        "--input", input_path,
+        "--output", output_path,
+        "--header",  # first CSV line holds the column names
+        "--name_column", "barcode_name",
+        "--sequence_column_index", "2",
+        ]
+    )
+    assert output_path.is_file()
+    with output_path.open('r') as open_output:
+        output_contents = open_output.read()
+    assert output_contents == expected
+
 def test_csvtofasta_header_select_column_by_name(run_component, random_path):
     csv_contents = dedent("""\
     barcode_name,some_other_column,sequence
@@ -301,5 +360,7 @@ def test_csvtofasta_2_columns_but_not_valid_sequence(run_component, random_path)
                      r"IUPAC identifiers for nucleotides\.", 
                      err.value.stdout.decode('utf-8'))
 
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__]))
\ No newline at end of file