Pull in dnanexus/read-id-regex merge into dp-indels

broadinstitute · Jan 4, 2015 · d7ee8ef · d7ee8ef
2 parents 8de7298 + 3500bdb
commit d7ee8ef
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 5 deletions.
diff --git a/read_utils.py b/read_utils.py
@@ -23,15 +23,13 @@
 # ***  purge_unmated  ***
 # =======================
 
-def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2) :
+def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex) :
     """Use mergeShuffledFastqSeqs to purge unmated reads, and put corresponding
        reads in the same order."""
     tempOutput = mkstempfname()
     mergeShuffledFastqSeqsPath = os.path.join(util.file.get_scripts_path(),
                                               'mergeShuffledFastqSeqs.pl')
-    # The regular expression that follow says that the sequence identifiers
-    # of corresponding sequences must be of the form SEQID/1 and SEQID/2
-    cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', '^@(\S+)/[1|2]$',
+    cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', regex,
               '-f1', inFastq1, '-f2', inFastq2, '-o', tempOutput]
     log.debug(' '.join(cmdline))
     subprocess.check_call(cmdline)
@@ -45,6 +43,11 @@ def parser_purge_unmated() :
                        Corresponding sequences must have sequence identifiers
                        of the form SEQID/1 and SEQID/2.
                     ''')
+    # The default regular expression says that the sequence identifiers of
+    # corresponding sequences must be of the form SEQID/1 and SEQID/2
+    parser.add_argument("--regex",
+        help="Perl regular expression to parse paired read IDs (default: %(default)s)",
+        default='^@(\S+)/[1|2]$')
     parser.add_argument('inFastq1',
         help='Input fastq file; 1st end of paired-end reads.')
     parser.add_argument('inFastq2',
@@ -61,7 +64,8 @@ def main_purge_unmated(args) :
     inFastq2 = args.inFastq2
     outFastq1 = args.outFastq1
     outFastq2 = args.outFastq2
-    purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2)
+    regex = args.regex
+    purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex)
     return 0
 
 __commands__.append(('purge_unmated', main_purge_unmated,

diff --git a/test/input/TestPurgeUnmated/in_sra1.fastq b/test/input/TestPurgeUnmated/in_sra1.fastq
@@ -0,0 +1,16 @@
+@from_chr1_1.1 XXX XXX
+TCAATAAAAAAAAAAAAGAAAGAAAAAAAAATTCTCCTCATTTTTGTTGT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+@from_ebov_1.1 XXX XXX
+GTGTGCTCAGTTGAAAATCCCTTGTCAACACCTAGGTCTTATCACATCAC
++
+BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+@from_chr9_1.1 XXX XXX
+ACAAGTGTGCCTTTGAAGGAGGAGGAGGAAGAGAGGAAGAAAGGAGAAGT
++
+CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+@from_chr1_2.1 XXX XXX
+AAAAAAAGAAAGAAAAAAAAATTCTCCTCATTTTTGTTGTTTTGAATTTT
++
+DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
diff --git a/test/input/TestPurgeUnmated/in_sra2.fastq b/test/input/TestPurgeUnmated/in_sra2.fastq
@@ -0,0 +1,16 @@
+@from_chr9_1.2 XXX XXX
+GGGGTGGATCGAGGTGGGTGGTGGCATCTGGGATCTCTGCTTGGCTTCCC
++
+CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCD
+@from_chr1_2.2 XXX XXX
+AATTCTAAGAAATTATATTATTTCTTTGATAATTTCCTCTCCTCTTGTTT
++
+DEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDE
+@from_ebov_2.2 XXX XXX
+GCGCCGAGTCTCACTGAATCTGACATGGATTACCACAAGATCTTGACAGC
++
+EFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEF
+@from_ebov_1.2 XXX XXX
+TTATTGTTAAAGGACAGCATTAGTTCACAGTCAAACAAGCAAGATTGAGA
++
+BCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBC
diff --git a/test/test_read_utils.py b/test/test_read_utils.py
@@ -23,6 +23,23 @@ def test_purge_unmated(self) :
         self.assertEqualContents(outFastq1, expected1Fastq)
         self.assertEqualContents(outFastq2, expected2Fastq)
 
+    # test on FASTQs with read IDs in the style of SRA fastq-dump
+    def test_purge_unmated_sra(self):
+        myInputDir = util.file.get_test_input_path(self)
+        inFastq1 = os.path.join(myInputDir, 'in_sra1.fastq')
+        inFastq2 = os.path.join(myInputDir, 'in_sra2.fastq')
+        outFastq1 = util.file.mkstempfname('.fastq')
+        outFastq2 = util.file.mkstempfname('.fastq')
+        parser = read_utils.parser_purge_unmated()
+        args = parser.parse_args(['--regex', '^@(\S+).[1|2] .*', inFastq1, inFastq2, outFastq1, outFastq2])
+        read_utils.main_purge_unmated(args)
+
+        # The expected outputs are identical to the previous case.
+        expected1Fastq = os.path.join(myInputDir, 'expected1.fastq')
+        expected2Fastq = os.path.join(myInputDir, 'expected2.fastq')
+        self.assertEqualContents(outFastq1, expected1Fastq)
+        self.assertEqualContents(outFastq2, expected2Fastq)
+
 class TestFastqToFasta(TestCaseWithTmp) :
     def test_fastq_to_fasta(self) :
         myInputDir = util.file.get_test_input_path(self)