diff --git a/.ci/run_32bit_test.sh b/.ci/run_32bit_test.sh index a1f61ea0..e418f82d 100755 --- a/.ci/run_32bit_test.sh +++ b/.ci/run_32bit_test.sh @@ -7,7 +7,7 @@ PYV=$PYTHON_VERSION apt-get update -apt-get install -y libssl-dev openssl wget build-essential +apt-get install -y libssl-dev openssl wget build-essential libffi-dev cd / wget https://www.python.org/ftp/python/$PYV/Python-$PYV.tar.xz tar xf Python-$PYV.tar.xz diff --git a/.travis.yml b/.travis.yml index 02eb2637..cfea8b54 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,9 @@ branches: except: - /^v[0-9]\.[0-9]\.[0-9]/ + +dist: xenial + sudo: required language: python @@ -80,15 +83,14 @@ matrix: env: TEST_SUITE=zran_test EXTRA_ARGS="--use_mmap" NITERS=25 NELEMS=805306368 TEST_PATTERN="test_seek_then_read_block" - python: 3.6 env: TEST_SUITE=zran_test EXTRA_ARGS="--use_mmap --concat" NITERS=25 NELEMS=805306368 TEST_PATTERN="test_seek_then_read_block" - - python: 3.6 - - env: TEST_SUITE=32bittest + # Tests covering the indexed_gzip module python: - 2.7 - - 3.4 - 3.5 - 3.6 + - 3.7 env: - TEST_SUITE=indexed_gzip_test NITERS=5000 NELEMS=50000 diff --git a/CHANGELOG.md b/CHANGELOG.md index 99c2ae18..8a0a271f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,17 @@ # `indexed_gzip` changelog +## 0.8.9 (May 14th 2019) + + +* The `IndexedGzipFile.import_index` method and `zran_import_index` function + can handle index files which do not contain any index points (#18). + + ## 0.8.8 (November 22nd 2018) * Fixed bug affecting files which were an exact multiple of the read buffer - size. + size (#15). 
## 0.8.7 (August 3rd 2018) diff --git a/appveyor.yml b/appveyor.yml index a40e524a..5f9670fc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,16 +13,6 @@ environment: MINICONDA: "C:\\Miniconda-x64" PYTHON_ARCH: "64" - - PYTHON_VERSION: "3.4" - PYTHON: "C:\\Miniconda34" - MINICONDA: "C:\\Miniconda34" - PYTHON_ARCH: "32" - - - PYTHON_VERSION: "3.4" - PYTHON: "C:\\Miniconda34-x64" - MINICONDA: "C:\\Miniconda34-x64" - PYTHON_ARCH: "64" - - PYTHON_VERSION: "3.5" PYTHON: "C:\\Miniconda35" MINICONDA: "C:\\Miniconda35" @@ -43,6 +33,18 @@ environment: MINICONDA: "C:\\Miniconda36-x64" PYTHON_ARCH: "64" + - PYTHON_VERSION: "3.7" + PYTHON: "C:\\Miniconda37" + MINICONDA: "C:\\Miniconda37" + PYTHON_ARCH: "32" + + - PYTHON_VERSION: "3.7" + PYTHON: "C:\\Miniconda37-x64" + MINICONDA: "C:\\Miniconda37-x64" + PYTHON_ARCH: "64" + + + install: - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" diff --git a/indexed_gzip/__init__.py b/indexed_gzip/__init__.py index f01acda4..383995f8 100644 --- a/indexed_gzip/__init__.py +++ b/indexed_gzip/__init__.py @@ -15,4 +15,4 @@ ZranError) -__version__ = '0.8.8' +__version__ = '0.8.9' diff --git a/indexed_gzip/indexed_gzip.pyx b/indexed_gzip/indexed_gzip.pyx index 82f81bd1..fdc676de 100644 --- a/indexed_gzip/indexed_gzip.pyx +++ b/indexed_gzip/indexed_gzip.pyx @@ -136,6 +136,9 @@ cdef class _IndexedGzipFile: with an open file handle (``fileobj``), or with a ``filename``. If the former, the file must have been opened in ``'rb'`` mode. + .. note:: The ``auto_build`` behaviour only takes place on calls to + :meth:`seek`. + :arg filename: File name. :arg mode: Opening mode. Must be either ``'r'`` or ``'rb``. @@ -145,7 +148,7 @@ cdef class _IndexedGzipFile: :arg fid: Deprecated, use ``fileobj`` instead. :arg auto_build: If ``True`` (the default), the index is - automatically built on seeks/reads. + automatically built on calls to :meth:`seek`. :arg spacing: Number of bytes between index seek points. 
diff --git a/indexed_gzip/tests/ctest_zran.pyx b/indexed_gzip/tests/ctest_zran.pyx index 0e881076..a1c9a13f 100644 --- a/indexed_gzip/tests/ctest_zran.pyx +++ b/indexed_gzip/tests/ctest_zran.pyx @@ -13,10 +13,13 @@ import itertools as it import subprocess as sp import sys import time +import gzip +import shutil import random import hashlib import tempfile import threading +import contextlib import numpy as np @@ -56,6 +59,21 @@ cimport indexed_gzip.zran as zran np.import_array() +@contextlib.contextmanager +def tempdir(): + """Returns a context manager which creates and returns a temporary + directory, and then deletes it on exit. + """ + testdir = tempfile.mkdtemp() + prevdir = os.getcwd() + try: + os.chdir(testdir) + yield testdir + finally: + os.chdir(prevdir) + shutil.rmtree(testdir) + + cdef read_element(zran.zran_index_t *index, element, nelems, seek=True): cdef void *buffer @@ -867,3 +885,55 @@ def test_export_then_import(testfile): zran.zran_free(&index1) zran.zran_free(&index2) + + +def test_export_import_no_points(): + + cdef zran.zran_index_t index + cdef void *buffer + + data = np.random.randint(1, 255, 100, dtype=np.uint8) + buf = ReadBuffer(100) + buffer = buf.buffer + + with tempdir(): + + with gzip.open('data.gz', 'wb') as f: + f.write(data.tostring()) + + with open('data.gz', 'rb') as pyfid: + cfid = fdopen(pyfid.fileno(), 'rb') + assert zran.zran_init(&index, + cfid, + 1048576, + 32768, + 131072, + 0) == 0 + assert zran.zran_read(&index, buffer, 100) == 100 + + pybuf = (buffer)[:100] + assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data) + + with open('data.gz.index', 'wb') as pyidxfid: + cidxfid = fdopen(pyidxfid.fileno(), 'wb') + assert zran.zran_export_index(&index, cidxfid) == 0 + zran.zran_free(&index) + + with open('data.gz', 'rb') as pyfid: + cfid = fdopen(pyfid.fileno(), 'rb') + assert zran.zran_init(&index, + cfid, + 1048576, + 32768, + 131072, + 0) == 0 + + with open('data.gz.index', 'rb') as pyidxfid: + cidxfid = 
fdopen(pyidxfid.fileno(), 'rb') + assert zran.zran_import_index(&index, cidxfid) == 0 + assert index.npoints == 0 + + assert zran.zran_read(&index, buffer, 100) == 100 + pybuf = (buffer)[:100] + assert np.all(np.frombuffer(pybuf, dtype=np.uint8) == data) + zran.zran_free(&index) diff --git a/indexed_gzip/tests/test_zran.py b/indexed_gzip/tests/test_zran.py index 39a9f8ad..a4451eea 100644 --- a/indexed_gzip/tests/test_zran.py +++ b/indexed_gzip/tests/test_zran.py @@ -36,3 +36,4 @@ def test_read_all_sequential( testfile, nelems): ctest def test_build_then_read( testfile, nelems, seed, use_mmap): ctest_zran.test_build_then_read( testfile, nelems, seed, use_mmap) def test_readbuf_spacing_sizes( testfile, nelems, niters, seed): ctest_zran.test_readbuf_spacing_sizes( testfile, nelems, niters, seed) def test_export_then_import( testfile): ctest_zran.test_export_then_import( testfile) + def test_export_import_no_points(): ctest_zran.test_export_import_no_points() diff --git a/indexed_gzip/zran.c b/indexed_gzip/zran.c index 365fa08b..00310bd7 100644 --- a/indexed_gzip/zran.c +++ b/indexed_gzip/zran.c @@ -8,6 +8,7 @@ * * Author: Paul McCarthy */ + #include #include #include @@ -36,6 +37,13 @@ static int is_readonly(FILE *fd) { return (fcntl(fileno(fd), F_GETFL) & O_ACCMODE) == O_RDONLY; } + + +static uint32_t max(uint32_t a, uint32_t b) { + + if (a > b) return a; + else return b; +} #endif #include "zran.h" @@ -416,7 +424,7 @@ uint32_t ZRAN_INFLATE_STOP_AT_BLOCK = 64; * parameters are respectively updated to contain the total number of * compressed bytes that were read from the file, and total number of * decompressed bytes that were copied to the data buffer. - + * * - ZRAN_INFLATE_OK: Inflation was successful and the requested * number of bytes were copied to the provided * data buffer. @@ -2698,8 +2706,11 @@ int zran_import_index(zran_index_t *index, * At this step, the number of points is known. Allocate space for new list * of points. 
This pointer should be cleaned up before exit in case of * failure. + * + * The index file is allowed to contain 0 points; we always allocate + * space for at least 8 points (same as in zran_init). */ - new_list = calloc(1, sizeof(zran_point_t) * npoints); + new_list = calloc(1, sizeof(zran_point_t) * max(npoints, 8)); if (new_list == NULL) goto memory_error; @@ -2836,8 +2847,12 @@ int zran_import_index(zran_index_t *index, index->list = new_list; index->npoints = npoints; - /* Let's not forget to update the size as well. */ - index->size = npoints; + /* + * Let's not forget to update the size as well. + * If npoints is less than 8, the list will have been + * allocated with space for 8 points. + */ + index->size = max(npoints, 8); zran_log("zran_import_index: done\n"); diff --git a/indexed_gzip/zran.h b/indexed_gzip/zran.h index abba98b1..aa1fc6ab 100644 --- a/indexed_gzip/zran.h +++ b/indexed_gzip/zran.h @@ -55,8 +55,9 @@ struct _zran_index { size_t uncompressed_size; /* - * Spacing size in bytes, relative to the compressed - * data stream, between adjacent index points + * Spacing size in bytes, relative to the + * uncompressed data stream, between adjacent + * index points. */ uint32_t spacing; @@ -319,6 +320,36 @@ enum { * * See zran_import_index for importing. * + * A zran index file is a binary file which has the following header + * structure.
All fields are assumed to be stored with little-endian + * ordering: + * + * | Offset | Length | Description | + * | 0 | 7 | File header (GZIDX\00\00) | + * | 7 | 8 | Compressed file size (uint64) | + * | 15 | 8 | Uncompressed file size (uint64) | + * | 23 | 4 | Index point spacing (uint32) | + * | 27 | 4 | Index window size W (uint32) | + * | 31 | 4 | Number of index points (uint32) | + * + * The header is followed by the offsets for each index point: + * + * | Offset | Length | Description | + * | 0 | 8 | Compressed offset for point 0 (uint64) | + * | 8 | 8 | Uncompressed offset for point 0 (uint64) | + * | 16 | 1 | Bit offset for point 0 (uint8) | + * | ... | ... | ... | + * | N*17 | 8 | Compressed offset for point N (uint64) | + * | ... | ... | ... | + * + * Finally the window data for every index point is concatenated + * (W represents the index window size): + * + * | Offset | Length | Description | + * | 0 | W | Window data for index point 0 | + * | ... | ... | ... | + * | N*W | W | Window data for index point N | + * * Returns: * - ZRAN_EXPORT_OK for success. *