From fee1a478c4b374ec2bd8d954ed9df76098e756df Mon Sep 17 00:00:00 2001
From: "andreas@fgu210.anat.ox.ac.uk" <andreas@fgu210.anat.ox.ac.uk>
Date: Fri, 16 Oct 2009 16:57:26 +0100
Subject: [PATCH] added range tests and update docu

---
 doc/api.rst         | 13 +++++++---
 doc/contents.rst    |  5 ++--
 doc/glossary.rst    |  4 +--
 doc/usage.rst       | 10 +++++---
 pysam/csamtools.pyx | 60 ++++++++++++++++++++++++++++-----------------
 setup.py            | 22 +++++------------
 tests/pysam_test.py | 36 +++++++++++++--------------
 7 files changed, 83 insertions(+), 67 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index 1703aa3e..d80624f6 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -1,7 +1,5 @@
 pysam - An interface for reading and writing SAM files
-=====================================================
-
-
+======================================================
 
 Pysam is a python module that makes it easy to read and manipulate mapped short read sequence data stored in SAM files. It's a lightweight wrapper of the samtools_ C-API.
 
@@ -18,6 +16,7 @@ Each iteration returns a :class:`~pysam.AlignedRead` object which represents a s
 	print alignedread
 
    samfile.close()
+
 To give::
 
     EAS56_57:6:190:289:82	0	99	<<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<;	69	CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA	0	192	1
@@ -68,6 +67,14 @@ To give::
         base in read EAS51_64:3:190:727:308 = G
     ...
 
+Commands available in :term:`csamtools` are available as simple function calls. For example::
+
+   pysam.sort( "ex1.bam", "output" )
+
+corresponds to the command line::
+
+   samtools sort ex1.bam output
+
 .. Note::
     #. Coordinates in pysam are always 0-based (following the python convention). SAM text files use 1-based coordinates.
     #. The above examples can be run in the /test directory of the installation directory. Type 'make' before running them.
diff --git a/doc/contents.rst b/doc/contents.rst
index 8775d4d4..f7cf8aba 100644
--- a/doc/contents.rst
+++ b/doc/contents.rst
@@ -8,7 +8,7 @@ pysam: samtools interface for python
 
 :Author: Andreas Heger, Tildon Grant Belgrad, Martin Goodson, Leo Goodstad
 :Date: |today|
-:Revision: $Id$
+:Version: |version|
 
 The *SAM/BAM* format is a way to store efficiently large numbers of alignments
 [Li2009]_, such as those routinely are created by next-generation sequencing 
@@ -25,10 +25,11 @@ Contents
 .. toctree::
    :maxdepth: 2
 
+   api.rst
    usage.rst
    glossary.rst
-   api.rst
    developer.rst
+   release.rst
 
 Indices and tables
 ------------------
diff --git a/doc/glossary.rst b/doc/glossary.rst
index 292c636e..2aca8f62 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -15,7 +15,7 @@ Glossary
       A genomic region, stated relative to a reference sequence. Consists of reference name ('chr1'), start (100000), and end (200000). 0-based coordinates. Can be expressed as a string ('chr1:10000:20000')
 
    column
-      Reads that are aligned to a base in the :term:`target` sequence.
+      Reads that are aligned to a base in the :term:`reference` sequence.
      
    Reference
       The sequence that reads have been mapped onto. For example ``chr1``, ``contig123``.
@@ -30,7 +30,7 @@ Glossary
 
    sam file
        A file containing aligned reads. The :term:`sam file` can either
-       be a :term:`bam file` or a :term:`tam file`.
+       be a :term:`BAM` file or a :term:`TAM` file.
 
    pileup
       Pileup     
diff --git a/doc/usage.rst b/doc/usage.rst
index eb134c3c..e8f0e893 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -1,3 +1,5 @@
+.. _Usage: 
+
 *****
 Usage
 *****
@@ -50,12 +52,12 @@ and return a :class:`pysam.AlignedRead` object for each::
    iter = pysam.IteratorRow( samfile, "seq1:10-20")
    for x in iter: print str(x)
 
-Note that both methods iterate through a :term:`bam file`
+Note that both methods iterate through a :term:`BAM` file
 on a read-by-read basis. They need not load all data into
 memory.
 
 Fetching returns all reads overlapping a region sorted
-by the lowest aligned base in the :term:`target` sequence.
+by the lowest aligned base in the :term:`reference` sequence.
 Note that it will also return reads that are only partially
 overlapping with the :term:`region`. Thus the reads returned
 might span a region that is larger than the one queried.
@@ -66,7 +68,7 @@ Using the pileup-engine
 The :term:`pileup` engine of :term:`csamtools` iterates
 over all reads that are aligned to a :term:`region`. In
 contrast to :term:`fetching`, the :term:`pileup` engine 
-returns for each base in the target sequence the reads that
+returns for each base in the :term:`reference` sequence the reads that
 map to that particular position.
 
 Again, there are two principal methods to iterate.
@@ -78,7 +80,7 @@ The first works via a callback function::
 
 The second method uses python iterators. The iterator
 :class:`pysam.IteratorColumn` will iterate through each :term:`column`
-(target bases) and return a list of aligned reads::
+(reference bases) and return a list of aligned reads::
 
    iter = pysam.IteratorRow( samfile, "seq1:10-20")
    for x in iter: print str(x)
diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx
index 76a27ff5..1ef39007 100644
--- a/pysam/csamtools.pyx
+++ b/pysam/csamtools.pyx
@@ -296,6 +296,40 @@ cdef class Samfile:
         """return the number of :ref:`reference` sequences."""
         return self.samfile.header.n_targets
 
+    def _parseRegion( self, reference = None, start = None, end = None, 
+                       region = None ):
+        '''parse region information.
+
+        raise Value for for invalid regions.
+
+        returns a tuple of region, tid, start and end. Region
+        is a valid samtools :term:`region` or None if the region
+        extends over the whole file.
+        '''
+        
+        cdef int rtid
+        cdef int rstart
+        cdef int rend
+
+        rtid = rstart = rend = 0
+
+        # translate to a region
+        if reference:
+            if start != None and end != None:
+                region = "%s:%i-%i" % (reference, start, end)
+            else:
+                region = reference
+
+        if region:
+            bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)        
+            if rtid < 0: raise ValueError( "invalid region `%s`" % region )
+
+            if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
+            if rstart < 0: raise ValueError( 'negative start coordinate (%i)' % rstart )
+            if rend < 0: raise ValueError( 'negative end coordinate (%i)' % rend )
+            
+        return region, rtid, rstart, rend
+
     def fetch( self, 
                reference = None, start = None, end = None, 
                region = None, 
@@ -325,16 +359,7 @@ cdef class Samfile:
         cdef int rstart
         cdef int rend
 
-        # translate to a region
-        if reference:
-            if start and end:
-                region = "%s:%i-%i" % (reference, start, end)
-            else:
-                region = reference
-
-        if region:
-            bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)        
-            if rtid < 0: raise ValueError( "invalid region `%s`" % region )
+        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
 
         if self.isbam:
             assert self._hasIndex(), "no index available for fetch"            
@@ -383,17 +408,8 @@ cdef class Samfile:
         cdef int rend
         cdef bam_plbuf_t *buf
 
-        # translate to a region
-        if reference:
-            if start and end:
-                region = "%s:%i-%i" % (reference, start, end)
-            else:
-                region = reference
-
-        if region:
-            bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)        
-            if rtid < 0: raise ValueError( "invalid region `%s`" % region )
-            
+        region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
+        
         if self.isbam:
             assert self._hasIndex(), "no index available for pileup"
 
@@ -874,7 +890,7 @@ cdef class AlignedRead:
         def __get__(self): 
             return self._delegate.core.qual
     property mrnm:
-        """the :term:`target` id of the mate """     
+        """the :term:`reference` id of the mate """     
         def __get__(self): 
             return self._delegate.core.mtid
     property mpos: 
diff --git a/setup.py b/setup.py
index a9380fc3..f9515d85 100644
--- a/setup.py
+++ b/setup.py
@@ -1,20 +1,10 @@
 #!/usr/bin/python
-"""\
+'''
 
-NCL
-***
+pysam
+*****
 
-Nested contained lists are a way to index segment data. See
-Alekseyenko & Lee (2007) 
-(http://bioinformatics.oxfordjournals.org/cgi/content/full/23/11/1386)
-
-The following code was taken from the author's implemetation
-in pygr (http://sourceforge.net/projects/pygr) and modified. The
-modifications include:
-
-1. remove target coordinates, only target_id is kept.
-
-"""\
+'''
 
 import os, sys, glob, shutil
 
@@ -47,7 +37,7 @@
 version = "0.1"
 
 classifiers = """
-Development Status :: 1 - Pre Alpha
+Development Status :: 2 - Alpha
 Operating System :: MacOS :: MacOS X
 Operating System :: Microsoft :: Windows :: Windows NT/2000
 Operating System :: OS Independent
@@ -81,7 +71,7 @@
     'author_email': "andreas.heger@gmail.com",
     'license': "GPL",
     'platforms': "ALL",
-    'url': "TBD",
+    'url': "http://code.google.com/p/pysam/",
     'py_modules': [
       "pysam/__init__", "pysam/Pileup"],
     'ext_modules': [pysam,],
diff --git a/tests/pysam_test.py b/tests/pysam_test.py
index cfc5498c..cd10171e 100755
--- a/tests/pysam_test.py
+++ b/tests/pysam_test.py
@@ -227,7 +227,7 @@ def setUp(self):
 
     def checkRange( self, rnge ):
         '''compare results from iterator with those from samtools.'''
-        ps = list(self.samfile.fetch(rnge))
+        ps = list(self.samfile.fetch(region=rnge))
         sa = list(pysam.view( "ex1.bam", rnge , raw = True) )
         self.assertEqual( len(ps), len(sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa) ))
         # check if the same reads are returned and in the same order
@@ -425,19 +425,19 @@ def setUp(self):
         self.samfile=pysam.Samfile( "ex1.bam","rb" )
 
     def testPileupColumn(self):
-        for pcolumn1 in self.samfile.pileup( "chr1:105" ):
+        for pcolumn1 in self.samfile.pileup( region="chr1:105" ):
             if pcolumn1.pos == 104:
                 self.assertEqual( pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0) )
                 self.assertEqual( pcolumn1.pos, 105-1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105-1) )
                 self.assertEqual( pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2) )
-        for pcolumn2 in self.samfile.pileup( "chr2:1480" ):
+        for pcolumn2 in self.samfile.pileup( region="chr2:1480" ):
             if pcolumn2.pos == 1479:
                 self.assertEqual( pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1) )
                 self.assertEqual( pcolumn2.pos, 1480-1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480-1) )
                 self.assertEqual( pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12) )
 
     def testPileupRead(self):
-        for pcolumn1 in self.samfile.pileup( "chr1:105" ):
+        for pcolumn1 in self.samfile.pileup( region="chr1:105" ):
             if pcolumn1.pos == 104:
                 self.assertEqual( len(pcolumn1.pileups), 2, "# reads aligned to column mismatch in position 1: %s != %s" % (len(pcolumn1.pileups), 2) )
 #                self.assertEqual( pcolumn1.pileups[0]  # need to test additional properties here
@@ -466,26 +466,26 @@ def testBackwardsOrderNewFormat(self):
         self.assertRaises( ValueError, self.samfile.fetch, 'chr1', 100, 10 )
 
     def testBackwardsOrderOldFormat(self):
-        self.assertRaises( ValueError, self.samfile.fetch, "chr1:100-10")
-
-#    def testOutOfRangeNegativeNewFormat(self):
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, -10 )
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, 0 )
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1", -5, -10 )
+        self.assertRaises( ValueError, self.samfile.fetch, region="chr1:100-10")
+        
+    def testOutOfRangeNegativeNewFormat(self):
+        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, -10 )
+        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 5, 0 )
+        self.assertRaises( ValueError, self.samfile.fetch, "chr1", -5, -10 )
 
-#    def testOutOfRangeNegativeOldFormat(self):
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5-10" )
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5-0" )
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1:-5--10" )
+    def testOutOfRangeNegativeOldFormat(self):
+        self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-10" )
+        self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5-0" )
+        self.assertRaises( ValueError, self.samfile.fetch, region="chr1:-5--10" )
 
     def testOutOfRangNewFormat(self):
         self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999 )
 
-#    def testOutOfRangeLargeNewFormat(self):
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 99999999999999999, 999999999999999999 )
+    def testOutOfRangeLargeNewFormat(self):
+        self.assertRaises( ValueError, self.samfile.fetch, "chr1", 9999999999999999999999999999999, 9999999999999999999999999999999999999999 )
 
-#    def testOutOfRangeLargeOldFormat(self):
-#        self.assertRaises( ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999" )
+    def testOutOfRangeLargeOldFormat(self):
+        self.assertRaises( ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999" )
 
     def tearDown(self):
         self.samfile.close()