single-end and expand tests added. fixed issue with empty alignment a…

…t R2 in single-end mode that was emerging together with --expand.
open2c · Oct 4, 2024 · 942b490 · 942b490
1 parent d3624a4
commit 942b490
Show file tree

Hide file tree

Showing 3 changed files with 196 additions and 58 deletions.
diff --git a/pairtools/cli/parse2.py b/pairtools/cli/parse2.py
@@ -43,10 +43,10 @@
     help="""Reported position of alignments in pairs of complex walks (pos columns). 
     Each alignment in .bam/.sam Hi-C-like data has two ends, and you can report one or another depending of the position of alignment on a read or in a pair.  
     
-    "junction" - inner ends of sequential alignments in each pair, aka ligation junctions (complex walks default),
+    "junction" - inner ends of sequential alignments in each pair, aka ligation junctions,
     "read" - 5'-end of alignments relative to R1 or R2 read coordinate system (as in traditional Hi-C),
     "walk" - 5'-end of alignments relative to the whole walk coordinate system,
-    "outer" - outer ends of sequential alignments in each pair. """,
+    "outer" - outer ends of sequential alignments in each pair (parse2 default). """,
 )
 @click.option(
     "--report-orientation",

diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py
@@ -668,7 +668,7 @@ def parse2_read(
         ]
 
         algns1 = normalize_alignment_list(algns1, 1, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap)
-        algns2 = [empty_alignment()]  # Empty alignment dummy
+        algns2 = []  # Empty alignment dummy
 
         if len(algns1) > 1:
             # Look for ligation pair, and report linear alignments after deduplication of complex walks:
@@ -891,11 +891,11 @@ def parse_complex_walk(
 
     **Intramolecular deduplication**
 
-     Forward read (left):                       right read (right):
+    Forward read (left):                       right read (right):
     5'------------------------->3'     3'<--------------------------5'
-             algns1                              algns2
+            algns1                              algns2
     <5---3><5---3><5---3><5---3>        <3---5><3---5><3---5><3---5>
-       l0     l1    l2     l3              r3     r2     r1    r0
+        l0     l1    l2     l3              r3     r2     r1    r0
 
     Alignment - bwa mem reported hit or alignment after gaps conversion.
     Left and right alignments (algns1: [l0, l1, l2, l3], algns2: [r0, r1, r2, r3])
@@ -929,8 +929,8 @@ def parse_complex_walk(
                 If comparison is successful, go to 6.
         6. Verify.
             Check that downstream pairs on the left read overlap with the upstream pairs on the right read.
-                 If yes, exit.
-                 If not, we do not have an overlap, go to step 3.
+                If yes, exit.
+                If not, we do not have an overlap, go to step 3.
     """
 
     AVAILABLE_REPORT_POSITION = ["outer", "junction", "read", "walk"]
@@ -1007,66 +1007,70 @@ def parse_complex_walk(
         if not is_overlap:
             current_right_pair = 1
 
-    # II. Search of partial overlap if there are less than 2 alignments at either sides, or no overlaps found
-    if current_right_pair == 1:
-        last_reported_alignment_left = last_reported_alignment_right = 1
-        if partial_overlap(
-            algns1[-1],
-            algns2[-1],
-            max_insert_size=max_insert_size,
-            dedup_max_mismatch=dedup_max_mismatch,
-        ):
-            if (
-                n_algns1 >= 2
-            ):  # single alignment on right read and multiple alignments on left
-                pair_index = (len(algns1) - 1, "R1")
-                output_pairs.append(
-                    format_pair(
-                        algns1[-2],
-                        algns1[-1],
-                        pair_index=pair_index,
-                        algn2_pos3=algns2[-1]["pos5"],
-                        report_position=report_position,
-                        report_orientation=report_orientation,
+    if (n_algns2 == 0):
+        last_reported_alignment_left = 1
+        last_reported_alignment_right = 0
+    else:
+        # II. Search of partial overlap if there are less than 2 alignments at either sides, or no overlaps found
+        if (current_right_pair == 1):
+            last_reported_alignment_left = last_reported_alignment_right = 1
+            if partial_overlap(
+                algns1[-1],
+                algns2[-1],
+                max_insert_size=max_insert_size,
+                dedup_max_mismatch=dedup_max_mismatch,
+            ):
+                if (
+                    n_algns1 >= 2
+                ):  # single alignment on right read and multiple alignments on left
+                    pair_index = (len(algns1) - 1, "R1")
+                    output_pairs.append(
+                        format_pair(
+                            algns1[-2],
+                            algns1[-1],
+                            pair_index=pair_index,
+                            algn2_pos3=algns2[-1]["pos5"],
+                            report_position=report_position,
+                            report_orientation=report_orientation,
+                        )
                     )
-                )
-                last_reported_alignment_left = 2  # set the pointer for reporting
+                    last_reported_alignment_left = 2  # set the pointer for reporting
+
+                if (
+                    n_algns2 >= 2
+                ):  # single alignment on left read and multiple alignments on right
+                    pair_index = (len(algns1), "R2")
+                    output_pairs.append(
+                        format_pair(
+                            algns2[-1],
+                            algns2[-2],
+                            pair_index=pair_index,
+                            algn1_pos3=algns1[-1]["pos5"],
+                            report_position=report_position,
+                            report_orientation=report_orientation,
+                        )
+                    )
+                    last_reported_alignment_right = 2  # set the pointer for reporting
+
+                # Note that if n_algns1==n_algns2==1 and alignments overlap, then we don't need to check,
+                # it's a non-ligated DNA fragment that we don't report.
 
-            if (
-                n_algns2 >= 2
-            ):  # single alignment on left read and multiple alignments on right
-                pair_index = (len(algns1), "R2")
+            else:  # end alignments do not overlap, report regular pair:
+                pair_index = (len(algns1), "R1-2")
                 output_pairs.append(
                     format_pair(
+                        algns1[-1],
                         algns2[-1],
-                        algns2[-2],
                         pair_index=pair_index,
-                        algn1_pos3=algns1[-1]["pos5"],
                         report_position=report_position,
                         report_orientation=report_orientation,
                     )
                 )
-                last_reported_alignment_right = 2  # set the pointer for reporting
-
-            # Note that if n_algns1==n_algns2==1 and alignments overlap, then we don't need to check,
-            # it's a non-ligated DNA fragment that we don't report.
-
-        else:  # end alignments do not overlap, report regular pair:
-            pair_index = (len(algns1), "R1-2")
-            output_pairs.append(
-                format_pair(
-                    algns1[-1],
-                    algns2[-1],
-                    pair_index=pair_index,
-                    report_position=report_position,
-                    report_orientation=report_orientation,
-                )
-            )
 
-    else:  # there was an overlap, set some pointers:
-        last_reported_alignment_left = (
-            last_reported_alignment_right
-        ) = current_right_pair
+        else:  # there was an overlap, set some pointers:
+            last_reported_alignment_left = (
+                last_reported_alignment_right
+            ) = current_right_pair
 
     # III. Report all remaining alignments.
     # Report all unique alignments on left read (sequential):
@@ -1146,7 +1150,6 @@ def expand_pairs(pairs_list, max_expansion_depth=None):
     list of expanded pairs
 
     """
-
     for algn1, _algn1, pair_index1 in pairs_list:
         for _algn2, algn2, pair_index2 in pairs_list:
             if pair_index1 > pair_index2:

diff --git a/tests/test_parse2.py b/tests/test_parse2.py
@@ -126,3 +126,138 @@ def test_mock_pysam_parse2_pair():
         print()
 
         assert assigned_pair == simulated_pair
+
+
+def test_mock_pysam_parse2_single_end():
+
+    """Testing single-end mode for parse2, no-flip mode.
+    --report-position is outer (parse2 default)
+    --report-orientation is pair (parse2 default)
+    """
+
+    mock_sam_path = os.path.join(testdir, "data", "mock.parse2-single-end.sam")
+    mock_chroms_path = os.path.join(testdir, "data", "mock.chrom.sizes")
+    try:
+        result = subprocess.check_output(
+            [
+                "python",
+                "-m",
+                "pairtools",
+                "parse2",
+                "-c",
+                mock_chroms_path,
+                "--single-end",
+                "--add-pair-index",
+                "--no-flip",
+                "--report-position",
+                "outer",
+                "--report-orientation",
+                "pair",
+                mock_sam_path,
+            ],
+        ).decode("ascii")
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        print(sys.exc_info())
+        raise e
+
+    # check if the header got transferred correctly
+    sam_header = [l.strip() for l in open(mock_sam_path, "r") if l.startswith("@")]
+    pairsam_header = [l.strip() for l in result.split("\n") if l.startswith("#")]
+    for l in sam_header:
+        assert any([l in l2 for l2 in pairsam_header])
+
+    # check that the pairs got assigned properly
+    id_counter = 0
+    prev_id = ""
+    for l in result.split("\n"):
+        if l.startswith("#") or not l:
+            continue
+
+        if prev_id == l.split("\t")[0]:
+            id_counter += 1
+        else:
+            id_counter = 0
+        prev_id = l.split("\t")[0]
+
+        assigned_pair = l.split("\t")[1:8] + l.split("\t")[-2:]
+        print(l.split("SIMULATED:", 1)[1].split("\031", 1)[0].split("|"), id_counter)
+        simulated_pair = (
+            l.split("SIMULATED:", 1)[1]
+            .split("\031", 1)[0]
+            .split("|")[id_counter]
+            .split(",")
+        )
+        print(assigned_pair)
+        print(simulated_pair, prev_id)
+        print()
+
+        assert assigned_pair == simulated_pair
+
+
+def test_mock_pysam_parse2_single_end_expand():
+
+    """Testing single-end mode for parse2, no-flip mode, with --expand.
+    --report-position is outer (parse2 default)
+    --report-orientation is pair (parse2 default)
+    """
+
+    mock_sam_path = os.path.join(testdir, "data", "mock.parse2-single-end.expand.sam")
+    mock_chroms_path = os.path.join(testdir, "data", "mock.chrom.sizes")
+    try:
+        result = subprocess.check_output(
+            [
+                "python",
+                "-m",
+                "pairtools",
+                "parse2",
+                "-c",
+                mock_chroms_path,
+                "--single-end",
+                "--expand",
+                "--add-pair-index",
+                "--no-flip",
+                "--report-position",
+                "outer",
+                "--report-orientation",
+                "pair",
+                mock_sam_path,
+            ],
+        ).decode("ascii")
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        print(sys.exc_info())
+        raise e
+
+    # check if the header got transferred correctly
+    sam_header = [l.strip() for l in open(mock_sam_path, "r") if l.startswith("@")]
+    pairsam_header = [l.strip() for l in result.split("\n") if l.startswith("#")]
+    for l in sam_header:
+        assert any([l in l2 for l2 in pairsam_header])
+
+    # check that the pairs got assigned properly
+    id_counter = 0
+    prev_id = ""
+    for l in result.split("\n"):
+        if l.startswith("#") or not l:
+            continue
+
+        if prev_id == l.split("\t")[0]:
+            id_counter += 1
+        else:
+            id_counter = 0
+        prev_id = l.split("\t")[0]
+
+        assigned_pair = l.split("\t")[1:8] + l.split("\t")[-2:]
+        print(l.split("SIMULATED:", 1)[1].split("\031", 1)[0].split("|"), id_counter)
+        simulated_pair = (
+            l.split("SIMULATED:", 1)[1]
+            .split("\031", 1)[0]
+            .split("|")[id_counter]
+            .split(",")
+        )
+        print(assigned_pair)
+        print(simulated_pair, prev_id)
+        print()
+
+        assert assigned_pair == simulated_pair