Skip to content

Commit

Permalink
Pull in dnanexus/read-id-regex merge into dp-indels
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Jan 4, 2015
2 parents 8de7298 + 3500bdb commit d7ee8ef
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 5 deletions.
14 changes: 9 additions & 5 deletions read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,13 @@
# *** purge_unmated ***
# =======================

def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2) :
def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex) :
"""Use mergeShuffledFastqSeqs to purge unmated reads, and put corresponding
reads in the same order."""
tempOutput = mkstempfname()
mergeShuffledFastqSeqsPath = os.path.join(util.file.get_scripts_path(),
'mergeShuffledFastqSeqs.pl')
# The regular expression that follows says that the sequence identifiers
# of corresponding sequences must be of the form SEQID/1 and SEQID/2
cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', '^@(\S+)/[1|2]$',
cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', regex,
'-f1', inFastq1, '-f2', inFastq2, '-o', tempOutput]
log.debug(' '.join(cmdline))
subprocess.check_call(cmdline)
Expand All @@ -45,6 +43,11 @@ def parser_purge_unmated() :
Corresponding sequences must have sequence identifiers
of the form SEQID/1 and SEQID/2.
''')
# The default regular expression says that the sequence identifiers of
# corresponding sequences must be of the form SEQID/1 and SEQID/2
parser.add_argument("--regex",
help="Perl regular expression to parse paired read IDs (default: %(default)s)",
default='^@(\S+)/[1|2]$')
parser.add_argument('inFastq1',
help='Input fastq file; 1st end of paired-end reads.')
parser.add_argument('inFastq2',
Expand All @@ -61,7 +64,8 @@ def main_purge_unmated(args) :
inFastq2 = args.inFastq2
outFastq1 = args.outFastq1
outFastq2 = args.outFastq2
purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2)
regex = args.regex
purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex)
return 0

__commands__.append(('purge_unmated', main_purge_unmated,
Expand Down
16 changes: 16 additions & 0 deletions test/input/TestPurgeUnmated/in_sra1.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@from_chr1_1.1 XXX XXX
TCAATAAAAAAAAAAAAGAAAGAAAAAAAAATTCTCCTCATTTTTGTTGT
+
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
@from_ebov_1.1 XXX XXX
GTGTGCTCAGTTGAAAATCCCTTGTCAACACCTAGGTCTTATCACATCAC
+
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
@from_chr9_1.1 XXX XXX
ACAAGTGTGCCTTTGAAGGAGGAGGAGGAAGAGAGGAAGAAAGGAGAAGT
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
@from_chr1_2.1 XXX XXX
AAAAAAAGAAAGAAAAAAAAATTCTCCTCATTTTTGTTGTTTTGAATTTT
+
DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
16 changes: 16 additions & 0 deletions test/input/TestPurgeUnmated/in_sra2.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@from_chr9_1.2 XXX XXX
GGGGTGGATCGAGGTGGGTGGTGGCATCTGGGATCTCTGCTTGGCTTCCC
+
CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCD
@from_chr1_2.2 XXX XXX
AATTCTAAGAAATTATATTATTTCTTTGATAATTTCCTCTCCTCTTGTTT
+
DEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDEDE
@from_ebov_2.2 XXX XXX
GCGCCGAGTCTCACTGAATCTGACATGGATTACCACAAGATCTTGACAGC
+
EFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEFEF
@from_ebov_1.2 XXX XXX
TTATTGTTAAAGGACAGCATTAGTTCACAGTCAAACAAGCAAGATTGAGA
+
BCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBCBC
17 changes: 17 additions & 0 deletions test/test_read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,23 @@ def test_purge_unmated(self) :
self.assertEqualContents(outFastq1, expected1Fastq)
self.assertEqualContents(outFastq2, expected2Fastq)

def test_purge_unmated_sra(self):
    """Run purge_unmated on FASTQs with SRA fastq-dump style read IDs.

    The input reads carry IDs such as ``@from_chr1_1.1 XXX XXX``: the mate
    number follows a dot and is trailed by extra text, so a custom
    ``--regex`` is passed on the command line instead of relying on the
    parser's default pattern.
    """
    myInputDir = util.file.get_test_input_path(self)
    inFastq1 = os.path.join(myInputDir, 'in_sra1.fastq')
    inFastq2 = os.path.join(myInputDir, 'in_sra2.fastq')
    outFastq1 = util.file.mkstempfname('.fastq')
    outFastq2 = util.file.mkstempfname('.fastq')

    # Raw string so that \S reaches the regex engine untouched instead of
    # being treated as an (invalid) Python string escape. The dot before
    # the mate number is escaped so it only matches a literal '.', and the
    # character class is [12] because '|' inside brackets is a literal
    # pipe, not alternation.
    sraRegex = r'^@(\S+)\.[12] .*'

    parser = read_utils.parser_purge_unmated()
    args = parser.parse_args(
        ['--regex', sraRegex, inFastq1, inFastq2, outFastq1, outFastq2])
    read_utils.main_purge_unmated(args)

    # The expected outputs are the same files used by test_purge_unmated.
    expected1Fastq = os.path.join(myInputDir, 'expected1.fastq')
    expected2Fastq = os.path.join(myInputDir, 'expected2.fastq')
    self.assertEqualContents(outFastq1, expected1Fastq)
    self.assertEqualContents(outFastq2, expected2Fastq)

class TestFastqToFasta(TestCaseWithTmp) :
def test_fastq_to_fasta(self) :
myInputDir = util.file.get_test_input_path(self)
Expand Down

0 comments on commit d7ee8ef

Please sign in to comment.