From 91b31b207481ea127f07aab140524f23aaf20deb Mon Sep 17 00:00:00 2001
From: Rutger Vos <rutgeraldo@gmail.com>
Date: Thu, 17 Oct 2024 14:18:40 +0200
Subject: [PATCH] adding unit tests for yaml and csv merging

---
 tests/test_merge.py | 119 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 tests/test_merge.py

diff --git a/tests/test_merge.py b/tests/test_merge.py
new file mode 100644
index 0000000..f4ccd8e
--- /dev/null
+++ b/tests/test_merge.py
@@ -0,0 +1,119 @@
+import pytest
+import sys
+import os
+
+# Add the parent directory to sys.path to import the module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from barcode_validator.result import DNAAnalysisResult, DNAAnalysisResultSet
+
+@pytest.fixture
+def result_set():
+    """Return a DNAAnalysisResultSet wrapping five freshly built results."""
+    identifiers = (
+        "BGENL001-23", "BGENL002-23", "BGENL003-23",
+        "BGENL004-23", "BGENL005-23",
+    )
+    # One DNAAnalysisResult per identifier, in the order listed above.
+    members = [DNAAnalysisResult(identifier) for identifier in identifiers]
+    return DNAAnalysisResultSet(members)
+
+@pytest.fixture
+def data_dir():
+    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # directory above this file's own
+    return os.path.join(parent_dir, 'examples', 'rich_set')
+
+def test_add_csv_file(result_set, data_dir):
+    """Merging the example CSV gives every result a non-empty, numeric 'cov_avg'."""
+    csv_file = os.path.join(data_dir, 'mge_fastp_r13s100_nocontam.csv')
+    result_set.add_csv_file(csv_file)
+
+    # Guard against a vacuous pass: the loop below checks nothing if the set is empty
+    assert len(result_set.results) > 0
+    for result in result_set.results:
+        assert 'cov_avg' in result.data['ancillary']
+        cov_avg = result.data['ancillary']['cov_avg']
+        assert cov_avg != ''  # Ensure it's not empty
+        try:
+            float_cov_avg = float(cov_avg)
+        except (TypeError, ValueError):
+            pytest.fail(f"cov_avg '{cov_avg}' is not a valid float string")
+        # Range check sits outside the try so its failure is not reported as a parse error
+        assert 0 <= float_cov_avg < float('inf'), f"cov_avg '{cov_avg}' is not a finite, non-negative number"
+
+def test_add_yaml_file(result_set, data_dir):
+    """Check that add_yaml_file() merges the example YAML into every result:
+    key presence is asserted for all results, exact values only for the first."""
+    yaml_path = os.path.join(data_dir, 'mge_fastp_r13s100_nocontam.yaml')
+    result_set.add_yaml_file(yaml_path)
+
+    # Every result must carry each of these keys after the merge
+    required = ('samples_file', 'protein_reference_file', 'output_dir', 'genes', 'r', 's', 'run_name')
+    for entry in result_set.results:
+        ancillary = entry.data['ancillary']
+        for key in required:
+            assert key in ancillary, f"Key '{key}' not found in ancillary data"
+
+    # Spot-check the concrete values on the first result
+    merged = result_set.results[0].data['ancillary']
+    assert merged['samples_file'] == "/gpfs/nhmfsa/bulk/share/data/mbl/share/workspaces/groups/genomics-collections/BGE/2024-02-01/DataDelivery_2024-02-01_18-24-39_snpseq00629/files/WK-3860/read_paths.csv"
+    assert merged['protein_reference_file'] == "/gpfs/nhmfsa/bulk/share/data/mbl/share/scratch/MGE/protein_references/benchmarking_data_570_refs-contam_refs_final14/benchmarking_data_570_taxonomy_gene_fetch_sum_out.csv"
+    assert merged['output_dir'] == "/gpfs/nhmfsa/bulk/share/data/mbl/share/scratch/MGE/cox1/benchmarking/MGE-fastp_pipeline_alt_params"
+    assert merged['genes'] == ['cox1']
+    assert merged['r'] == [1.3]
+    assert merged['s'] == [100]
+    assert merged['run_name'] == "mge_fastp_r13s100_nocontam"
+
+
+def test_result_fields(result_set, data_dir):
+    """Merge both example files, then check fields, values and string output.
+
+    After add_csv_file() and add_yaml_file(), the first result must list the
+    merged names in result_fields() and the merged values in get_values(); the
+    first two lines of str(result_set) are checked as tab-separated header and data.
+    """
+    expected_samples_file = "/gpfs/nhmfsa/bulk/share/data/mbl/share/workspaces/groups/genomics-collections/BGE/2024-02-01/DataDelivery_2024-02-01_18-24-39_snpseq00629/files/WK-3860/read_paths.csv"
+
+    def is_positive_number(value):
+        # float() raises ValueError on non-numeric strings; such values just don't count
+        try:
+            return isinstance(value, str) and value != '' and float(value) > 0
+        except ValueError:
+            return False
+
+    # Add CSV file
+    csv_file = os.path.join(data_dir, 'mge_fastp_r13s100_nocontam.csv')
+    result_set.add_csv_file(csv_file)
+
+    # Add YAML file
+    yaml_file = os.path.join(data_dir, 'mge_fastp_r13s100_nocontam.yaml')
+    result_set.add_yaml_file(yaml_file)
+
+    # Check result fields
+    fields = result_set.results[0].result_fields()
+    for name in ('cov_avg', 'samples_file', 'protein_reference_file', 'output_dir', 'genes', 'r', 's', 'run_name'):
+        assert name in fields, f"Field '{name}' missing from result_fields()"
+
+    # Check values for a single result
+    values = result_set.results[0].get_values()
+    assert any(is_positive_number(v) for v in values)  # cov_avg from CSV
+    assert any(v == expected_samples_file for v in values)  # samples_file from YAML
+
+    # Check string representation of the entire result set
+    lines = str(result_set).split('\n')
+
+    # Check header
+    header = lines[0].split('\t')
+    for name in ('cov_avg', 'samples_file', 'genes', 'r', 's', 'run_name'):
+        assert name in header, f"Column '{name}' missing from header"
+
+    # Check content (first data line)
+    first_data_line = lines[1].split('\t')
+    assert expected_samples_file in first_data_line
+    assert "['cox1']" in first_data_line
+    assert "[1.3]" in first_data_line
+    assert "[100]" in first_data_line
+    assert "mge_fastp_r13s100_nocontam" in first_data_line
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__, *sys.argv[1:]]))  # run only this file's tests and propagate pytest's exit code
\ No newline at end of file