Merge pull request #19 from pauldmccarthy/bf/export

Bf/export
pauldmccarthy · May 14, 2019 · aba0358 · aba0358
2 parents 16554e6 + d9e5b5e
commit aba0358
Show file tree

Hide file tree

Showing 10 changed files with 154 additions and 23 deletions.
diff --git a/.ci/run_32bit_test.sh b/.ci/run_32bit_test.sh
@@ -7,7 +7,7 @@
 PYV=$PYTHON_VERSION
 
 apt-get update
-apt-get install -y libssl-dev openssl wget build-essential
+apt-get install -y libssl-dev openssl wget build-essential libffi-dev
 cd /
 wget https://www.python.org/ftp/python/$PYV/Python-$PYV.tar.xz
 tar xf Python-$PYV.tar.xz

diff --git a/.travis.yml b/.travis.yml
@@ -2,6 +2,9 @@ branches:
   except:
     - /^v[0-9]\.[0-9]\.[0-9]/
 
+
+dist: xenial
+
 sudo: required
 
 language: python
@@ -80,15 +83,14 @@ matrix:
       env:    TEST_SUITE=zran_test EXTRA_ARGS="--use_mmap"           NITERS=25   NELEMS=805306368 TEST_PATTERN="test_seek_then_read_block"
     - python: 3.6
       env:    TEST_SUITE=zran_test EXTRA_ARGS="--use_mmap --concat"  NITERS=25   NELEMS=805306368 TEST_PATTERN="test_seek_then_read_block"
-    - python: 3.6
-    - env:    TEST_SUITE=32bittest
+
 
 # Tests covering the indexed_gzip module
 python:
   - 2.7
-  - 3.4
   - 3.5
   - 3.6
+  - 3.7
 
 env:
   - TEST_SUITE=indexed_gzip_test                                   NITERS=5000 NELEMS=50000

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,17 @@
 # `indexed_gzip` changelog
 
 
+## 0.8.9 (May 14th 2019)
+
+
+* The `IndexedGzipFile.import_index` method and `zran_import_index` function
+  can handle index files which do not contain any index points (#18).
+
+
 ## 0.8.8 (November 22nd 2018)
 
 * Fixed bug affecting files which were an exact multiple of the read buffer
-  size.
+  size (#15).
 
 
 ## 0.8.7 (August 3rd 2018)

diff --git a/appveyor.yml b/appveyor.yml
@@ -13,16 +13,6 @@ environment:
       MINICONDA:      "C:\\Miniconda-x64"
       PYTHON_ARCH:    "64"
 
-    - PYTHON_VERSION: "3.4"
-      PYTHON:         "C:\\Miniconda34"
-      MINICONDA:      "C:\\Miniconda34"
-      PYTHON_ARCH:    "32"
-
-    - PYTHON_VERSION: "3.4"
-      PYTHON:         "C:\\Miniconda34-x64"
-      MINICONDA:      "C:\\Miniconda34-x64"
-      PYTHON_ARCH:    "64"
-
     - PYTHON_VERSION: "3.5"
       PYTHON:         "C:\\Miniconda35"
       MINICONDA:      "C:\\Miniconda35"
@@ -43,6 +33,18 @@ environment:
       MINICONDA:      "C:\\Miniconda36-x64"
       PYTHON_ARCH:    "64"
 
+    - PYTHON_VERSION: "3.7"
+      PYTHON:         "C:\\Miniconda37"
+      MINICONDA:      "C:\\Miniconda37"
+      PYTHON_ARCH:    "32"
+
+    - PYTHON_VERSION: "3.7"
+      PYTHON:         "C:\\Miniconda37-x64"
+      MINICONDA:      "C:\\Miniconda37-x64"
+      PYTHON_ARCH:    "64"
+
+
+
 
 install:
   - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%"

diff --git a/indexed_gzip/__init__.py b/indexed_gzip/__init__.py
@@ -15,4 +15,4 @@
                            ZranError)
 
 
-__version__ = '0.8.8'
+__version__ = '0.8.9'
diff --git a/indexed_gzip/indexed_gzip.pyx b/indexed_gzip/indexed_gzip.pyx
@@ -136,6 +136,9 @@ cdef class _IndexedGzipFile:
         with an open file handle (``fileobj``), or with a ``filename``. If the
         former, the file must have been opened in ``'rb'`` mode.
 
+        .. note:: The ``auto_build`` behaviour only takes place on calls to
+                  :meth:`seek`.
+
         :arg filename:         File name.
 
         :arg mode:             Opening mode. Must be either ``'r'`` or ``'rb'``.
@@ -145,7 +148,7 @@ cdef class _IndexedGzipFile:
         :arg fid:              Deprecated, use ``fileobj`` instead.
 
         :arg auto_build:       If ``True`` (the default), the index is
-                               automatically built on seeks/reads.
+                               automatically built on calls to :meth:`seek`.
 
         :arg spacing:          Number of bytes between index seek points.
 

diff --git a/indexed_gzip/tests/ctest_zran.pyx b/indexed_gzip/tests/ctest_zran.pyx
@@ -13,10 +13,13 @@ import itertools       as it
 import subprocess      as sp
 import                    sys
 import                    time
+import                    gzip
+import                    shutil
 import                    random
 import                    hashlib
 import                    tempfile
 import                    threading
+import                    contextlib
 
 import numpy as np
 
@@ -56,6 +59,21 @@ cimport indexed_gzip.zran as zran
 np.import_array()
 
 
+@contextlib.contextmanager
+def tempdir():
+    """Returns a context manager which creates and returns a temporary
+    directory, and then deletes it on exit.
+    """
+    testdir = tempfile.mkdtemp()
+    prevdir = os.getcwd()
+    try:
+        os.chdir(testdir)
+        yield testdir
+    finally:
+        os.chdir(prevdir)
+        shutil.rmtree(testdir)
+
+
 cdef read_element(zran.zran_index_t *index, element, nelems, seek=True):
 
     cdef void *buffer
@@ -867,3 +885,55 @@ def test_export_then_import(testfile):
 
         zran.zran_free(&index1)
         zran.zran_free(&index2)
+
+
+def test_export_import_no_points():
+
+    cdef zran.zran_index_t index
+    cdef void             *buffer
+
+    data   = np.random.randint(1, 255, 100, dtype=np.uint8)
+    buf    = ReadBuffer(100)
+    buffer = buf.buffer
+
+    with tempdir():
+
+        with gzip.open('data.gz', 'wb') as f:
+            f.write(data.tostring())
+
+        with open('data.gz', 'rb')  as pyfid:
+            cfid = fdopen(pyfid.fileno(), 'rb')
+            assert zran.zran_init(&index,
+                                  cfid,
+                                  1048576,
+                                  32768,
+                                  131072,
+                                  0) == 0
+            assert zran.zran_read(&index, buffer, 100)  == 100
+
+            pybuf = <bytes>(<char *>buffer)[:100]
+            assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data)
+
+            with open('data.gz.index', 'wb') as pyidxfid:
+                cidxfid = fdopen(pyidxfid.fileno(), 'wb')
+                assert zran.zran_export_index(&index, cidxfid) == 0
+            zran.zran_free(&index)
+
+        with open('data.gz', 'rb')  as pyfid:
+            cfid = fdopen(pyfid.fileno(), 'rb')
+            assert zran.zran_init(&index,
+                                  cfid,
+                                  1048576,
+                                  32768,
+                                  131072,
+                                  0) == 0
+
+            with open('data.gz.index', 'rb') as pyidxfid:
+                cidxfid = fdopen(pyidxfid.fileno(), 'rb')
+                assert zran.zran_import_index(&index, cidxfid) == 0
+            assert index.npoints == 0
+
+            assert zran.zran_read(&index, buffer, 100)  == 100
+            pybuf = <bytes>(<char *>buffer)[:100]
+            assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data)
+            zran.zran_free(&index)
diff --git a/indexed_gzip/tests/test_zran.py b/indexed_gzip/tests/test_zran.py
@@ -36,3 +36,4 @@ def test_read_all_sequential(   testfile, nelems):                         ctest
     def test_build_then_read(       testfile, nelems, seed, use_mmap):         ctest_zran.test_build_then_read(       testfile, nelems, seed, use_mmap)
     def test_readbuf_spacing_sizes( testfile, nelems, niters, seed):           ctest_zran.test_readbuf_spacing_sizes( testfile, nelems, niters, seed)
     def test_export_then_import(    testfile):                                 ctest_zran.test_export_then_import(    testfile)
+    def test_export_import_no_points():                                        ctest_zran.test_export_import_no_points()
diff --git a/indexed_gzip/zran.c b/indexed_gzip/zran.c
@@ -8,6 +8,7 @@
  *
  * Author: Paul McCarthy <[email protected]>
  */
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,6 +37,13 @@ static int is_readonly(FILE *fd)
 {
     return (fcntl(fileno(fd), F_GETFL) & O_ACCMODE) == O_RDONLY;
 }
+
+
+static uint32_t max(uint32_t a, uint32_t b) {
+
+  if (a > b) return a;
+  else       return b;
+}
 #endif
 
 #include "zran.h"
@@ -416,7 +424,7 @@ uint32_t ZRAN_INFLATE_STOP_AT_BLOCK         = 64;
  * parameters are respectively updated to contain the total number of
  * compressed bytes that were read from the file, and total number of
  * decompressed bytes that were copied to the data buffer.
-
+ *
  *   - ZRAN_INFLATE_OK:             Inflation was successful and the requested
  *                                  number of bytes were copied to the provided
  *                                  data buffer.
@@ -2698,8 +2706,11 @@ int zran_import_index(zran_index_t *index,
      * At this step, the number of points is known. Allocate space for new list
      * of points. This pointer should be cleaned up before exit in case of
      * failure.
+     *
+     * The index file is allowed to contain 0 points, in which case we
+     * allocate space for 8 points (same as in zran_init).
      */
-    new_list = calloc(1, sizeof(zran_point_t) * npoints);
+    new_list = calloc(1, sizeof(zran_point_t) * max(npoints, 8));
 
     if (new_list == NULL) goto memory_error;
 
@@ -2836,8 +2847,12 @@ int zran_import_index(zran_index_t *index,
     index->list    = new_list;
     index->npoints = npoints;
 
-    /* Let's not forget to update the size as well. */
-    index->size    = npoints;
+    /*
+     * Let's not forget to update the size as well.
+     * If npoints is less than 8, the list will
+     * have been allocated with space for 8 points.
+     */
+    index->size    = max(npoints, 8);
 
     zran_log("zran_import_index: done\n");
 

diff --git a/indexed_gzip/zran.h b/indexed_gzip/zran.h
@@ -55,8 +55,9 @@ struct _zran_index {
     size_t        uncompressed_size;
 
     /*
-     * Spacing size in bytes, relative to the compressed
-     * data stream, between adjacent index points
+     * Spacing size in bytes, relative to the
+     * uncompressed data stream, between adjacent
+     * index points.
      */
     uint32_t      spacing;
 
@@ -319,6 +320,36 @@ enum {
  *
  * See zran_import_index for importing.
  *
+ * A zran index file is a binary file which has the following header
+ * structure. All fields are assumed to be stored with little-endian
+ * ordering:
+ *
+ * | Offset | Length | Description                     |
+ * | 0      | 7      | File header (GZIDX\00\00)       |
+ * | 7      | 8      | Compressed file size  (uint64)  |
+ * | 15     | 8      | Uncompressed file size (uint64) |
+ * | 23     | 4      | Index point spacing (uint32)    |
+ * | 27     | 4      | Index window size W (uint32)    |
+ * | 31     | 4      | Number of index points (uint32) |
+ *
+ * The header is followed by the offsets for each index point:
+ *
+ * | Offset | Length | Description                              |
+ * | 0      | 8      | Compressed offset for point 0 (uint64)   |
+ * | 8      | 8      | Uncompressed offset for point 0 (uint64) |
+ * | 16     | 1      | Bit offset for point 0 (uint8)           |
+ * | ...    | ...    | ...                                      |
+ * | N*17   | 8      | Compressed offset for point N (uint64)   |
+ * | ...    | ...    | ...                                      |
+ *
+ * Finally the window data for every index point is concatenated
+ * (W represents the index window size):
+ *
+ * | Offset | Length | Description                   |
+ * | 0      | W      | Window data for index point 0 |
+ * | ...    | ...    | ...                           |
+ * | N*W    | W      | Window data for index point N |
+ *
  * Returns:
  *   - ZRAN_EXPORT_OK for success.
  *
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,4 +15,4 @@
		ZranError)


		__version__ = '0.8.8'
		__version__ = '0.8.9'