From fee1a478c4b374ec2bd8d954ed9df76098e756df Mon Sep 17 00:00:00 2001 From: "andreas@fgu210.anat.ox.ac.uk" Date: Fri, 16 Oct 2009 16:57:26 +0100 Subject: [PATCH] added range tests and update docu --- doc/api.rst | 13 +++++++--- doc/contents.rst | 5 ++-- doc/glossary.rst | 4 +-- doc/usage.rst | 10 +++++--- pysam/csamtools.pyx | 60 ++++++++++++++++++++++++++++----------------- setup.py | 22 +++++------------ tests/pysam_test.py | 36 +++++++++++++-------------- 7 files changed, 83 insertions(+), 67 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 1703aa3e..d80624f6 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1,7 +1,5 @@ pysam - An interface for reading and writing SAM files -===================================================== - - +====================================================== Pysam is a python module that makes it easy to read and manipulate mapped short read sequence data stored in SAM files. It's a lightweight wrapper of the samtools_ C-API. @@ -18,6 +16,7 @@ Each iteration returns a :class:`~pysam.AlignedRead` object which represents a s print alignedread samfile.close() + To give:: EAS56_57:6:190:289:82 0 99 <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; 69 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA 0 192 1 @@ -68,6 +67,14 @@ To give:: base in read EAS51_64:3:190:727:308 = G ... +Commands available in :term:`csamtools` are available as simple function calls. For example:: + + pysam.sort( "ex1.bam", "output" ) + +corresponds to the command line:: + + samtools sort ex1.bam output + .. Note:: #. Coordinates in pysam are always 0-based (following the python convention). SAM text files use 1-based coordinates. #. The above examples can be run in the /test directory of the installation directory. Type 'make' before running them. diff --git a/doc/contents.rst b/doc/contents.rst index 8775d4d4..f7cf8aba 100644 --- a/doc/contents.rst +++ b/doc/contents.rst @@ -8,7 +8,7 @@ pysam: samtools interface for python :Author: Andreas Heger, Tildon Grant Belgrad, Martin Goodson, Leo Goodstad :Date: |today| -:Revision: $Id$ +:Version: |version| The *SAM/BAM* format is a way to store efficiently large numbers of alignments [Li2009]_, such as those routinely are created by next-generation sequencing @@ -25,10 +25,11 @@ Contents .. toctree:: :maxdepth: 2 + api.rst usage.rst glossary.rst - api.rst developer.rst + release.rst Indices and tables ------------------ diff --git a/doc/glossary.rst b/doc/glossary.rst index 292c636e..2aca8f62 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -15,7 +15,7 @@ Glossary A genomic region, stated relative to a reference sequence. Consists of reference name ('chr1'), start (100000), and end (200000). 0-based coordinates. Can be expressed as a string ('chr1:10000:20000') column - Reads that are aligned to a base in the :term:`target` sequence. + Reads that are aligned to a base in the :term:`reference` sequence. Reference The sequence that reads have been mapped onto. For example ``chr1``, ``contig123``. @@ -30,7 +30,7 @@ Glossary sam file A file containing aligned reads. The :term:`sam file` can either - be a :term:`bam file` or a :term:`tam file`. + be a :term:`BAM` file or a :term:`TAM` file. pileup Pileup diff --git a/doc/usage.rst b/doc/usage.rst index eb134c3c..e8f0e893 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -1,3 +1,5 @@ +.. _Usage: + ***** Usage ***** @@ -50,12 +52,12 @@ and return a :class:`pysam.AlignedRead` object for each:: iter = pysam.IteratorRow( samfile, "seq1:10-20") for x in iter: print str(x) -Note that both methods iterate through a :term:`bam file` +Note that both methods iterate through a :term:`BAM` file on a read-by-read basis. They need not load all data into memory. Fetching returns all reads overlapping a region sorted -by the lowest aligned base in the :term:`target` sequence. +by the lowest aligned base in the :term:`reference` sequence. Note that it will also return reads that are only partially overlapping with the :term:`region`. Thus the reads returned might span a region that is larger than the one queried. @@ -66,7 +68,7 @@ Using the pileup-engine The :term:`pileup` engine of :term:`csamtools` iterates over all reads that are aligned to a :term:`region`. In contrast to :term:`fetching`, the :term:`pileup` engine -returns for each base in the target sequence the reads that +returns for each base in the :term:`reference` sequence the reads that map to that particular position. Again, there are two principal methods to iterate. @@ -78,7 +80,7 @@ The first works via a callback function:: The second method uses python iterators. The iterator :class:`pysam.IteratorColumn` will iterate through each :term:`column` -(target bases) and return a list of aligned reads:: +(reference bases) and return a list of aligned reads:: iter = pysam.IteratorRow( samfile, "seq1:10-20") for x in iter: print str(x) diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index 76a27ff5..1ef39007 100644 --- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -296,6 +296,40 @@ cdef class Samfile: """return the number of :ref:`reference` sequences.""" return self.samfile.header.n_targets + def _parseRegion( self, reference = None, start = None, end = None, + region = None ): + '''parse region information. + + raise Value for for invalid regions. + + returns a tuple of region, tid, start and end. Region + is a valid samtools :term:`region` or None if the region + extends over the whole file. + ''' + + cdef int rtid + cdef int rstart + cdef int rend + + rtid = rstart = rend = 0 + + # translate to a region + if reference: + if start != None and end != None: + region = "%s:%i-%i" % (reference, start, end) + else: + region = reference + + if region: + bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend) + if rtid < 0: raise ValueError( "invalid region `%s`" % region ) + + if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) + if rstart < 0: raise ValueError( 'negative start coordinate (%i)' % rstart ) + if rend < 0: raise ValueError( 'negative end coordinate (%i)' % rend ) + + return region, rtid, rstart, rend + def fetch( self, reference = None, start = None, end = None, region = None, @@ -325,16 +359,7 @@ cdef class Samfile: cdef int rstart cdef int rend - # translate to a region - if reference: - if start and end: - region = "%s:%i-%i" % (reference, start, end) - else: - region = reference - - if region: - bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend) - if rtid < 0: raise ValueError( "invalid region `%s`" % region ) + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) if self.isbam: assert self._hasIndex(), "no index available for fetch" @@ -383,17 +408,8 @@ cdef class Samfile: cdef int rend cdef bam_plbuf_t *buf - # translate to a region - if reference: - if start and end: - region = "%s:%i-%i" % (reference, start, end) - else: - region = reference - - if region: - bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend) - if rtid < 0: raise ValueError( "invalid region `%s`" % region ) - + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) + if self.isbam: assert self._hasIndex(), "no index available for pileup" @@ -874,7 +890,7 @@ cdef class AlignedRead: def __get__(self): return self._delegate.core.qual property mrnm: - """the :term:`target` id of the mate """ + """the :term:`reference` id of the mate """ def __get__(self): return self._delegate.core.mtid property mpos: diff --git a/setup.py b/setup.py index a9380fc3..f9515d85 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,10 @@ #!/usr/bin/python -"""\ +''' -NCL -*** +pysam +***** -Nested contained lists are a way to index segment data. See -Alekseyenko & Lee (2007) -(http://bioinformatics.oxfordjournals.org/cgi/content/full/23/11/1386) - -The following code was taken from the author's implemetation -in pygr (http://sourceforge.net/projects/pygr) and modified. The -modifications include: - -1. remove target coordinates, only target_id is kept. - -"""\ +''' import os, sys, glob, shutil @@ -47,7 +37,7 @@ version = "0.1" classifiers = """ -Development Status :: 1 - Pre Alpha +Development Status :: 2 - Alpha Operating System :: MacOS :: MacOS X Operating System :: Microsoft :: Windows :: Windows NT/2000 Operating System :: OS Independent @@ -81,7 +71,7 @@ 'author_email': "andreas.heger@gmail.com", 'license': "GPL", 'platforms': "ALL", - 'url': "TBD", + 'url': "http://code.google.com/p/pysam/", 'py_modules': [ "pysam/__init__", "pysam/Pileup"], 'ext_modules': [pysam,], diff --git a/tests/pysam_test.py b/tests/pysam_test.py index cfc5498c..cd10171e 100755 --- a/tests/pysam_test.py +++ b/tests/pysam_test.py @@ -227,7 +227,7 @@ def setUp(self): def checkRange( self, rnge ): '''compare results from iterator with those from samtools.''' - ps = list(self.samfile.fetch(rnge)) + ps = list(self.samfile.fetch(region=rnge)) sa = list(pysam.view( "ex1.bam", rnge , raw = True) ) self.assertEqual( len(ps), len(sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa) )) # check if the same reads are returned and in the same order @@ -425,19 +425,19 @@ def setUp(self): self.samfile=pysam.Samfile( "ex1.bam","rb" ) def testPileupColumn(self): - for pcolumn1 in self.samfile.pileup( "chr1:105" ): + for pcolumn1 in self.samfile.pileup( region="chr1:105" ): if pcolumn1.pos == 104: self.assertEqual( pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0) ) self.assertEqual( pcolumn1.pos, 105-1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105-1) ) self.assertEqual( pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2) ) - for pcolumn2 in self.samfile.pileup( "chr2:1480" ): + for pcolumn2 in self.samfile.pileup( region="chr2:1480" ): if pcolumn2.pos == 1479: self.assertEqual( pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1) ) self.assertEqual( pcolumn2.pos, 1480-1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480-1) ) self.assertEqual( pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12) ) def testPileupRead(self): - for pcolumn1 in self.samfile.pileup( "chr1:105" ): + for pcolumn1 in self.samfile.pileup( region="chr1:105" ): if pcolumn1.pos == 104: self.assertEqual( len(pcolumn1.pileups), 2, "# reads aligned to column mismatch in position 1: %s != %s" % (len(pcolumn1.pileups), 2) ) # self.assertEqual( pcolumn1.pileups[0] # need to test additional properties here @@ -466,26 +466,26 @@ def testBackwardsOrderNewFormat(self): self.assertRaises( ValueError, self.samfile.fetch, 'chr1', 100, 10 ) def testBackwardsOrderOldFormat(self): - self.assertRaises( ValueError, self.samfile.fetch, "chr1:100-10") - -# def testOutOfRangeNegativeNewFormat(self): -# self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, -10 ) -# self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, 0 ) -# self.assertRaises( ValueError, self.samfile.fetch, "chr1", -5, -10 ) + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:100-10") + + def testOutOfRangeNegativeNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, -10 ) + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, 0 ) + self.assertRaises( ValueError, self.samfile.fetch, "chr1", -5, -10 ) -# def testOutOfRangeNegativeOldFormat(self): -# self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5-10" ) -# self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5-0" ) -# self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5--10" ) + def testOutOfRangeNegativeOldFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-10" ) + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-0" ) + self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5--10" ) def testOutOfRangNewFormat(self): self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999 ) -# def testOutOfRangeLargeNewFormat(self): -# self.assertRaises( ValueError, self.samfile.fetch, "chr1", 99999999999999999, 999999999999999999 ) + def testOutOfRangeLargeNewFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999999999999999999999999, 9999999999999999999999999999999999999999 ) -# def testOutOfRangeLargeOldFormat(self): -# self.assertRaises( ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999" ) + def testOutOfRangeLargeOldFormat(self): + self.assertRaises( ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999" ) def tearDown(self): self.samfile.close()