From 27aacf3a1b1a79537b15f3fa950e0f799489b146 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 28 Dec 2020 13:54:57 +0100 Subject: [PATCH] squased commit 1.8.0 / Optimizations --- .gitignore | 1 - .gitmodules | 0 CMakeLists.txt | 45 +- Changelog | 8 + README.md | 17 +- benchmarks/.gitignore | 0 ...-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt | 4 + dependencies/.gitignore | 1 + dependencies/zstd-lib-common/LICENSE | 30 + dependencies/zstd-lib-common/README.txt | 1 + dependencies/zstd-lib-common/mem.h | 453 +++++++++ dependencies/zstd-lib-common/xxhash.c | 862 +++++++++++++++++ dependencies/zstd-lib-common/xxhash.h | 285 ++++++ .../zstd-seekable-adapted/CMakeLists.txt | 0 dependencies/zstd-seekable-adapted/README.md | 1 + .../zstd-seekable-adapted/zstd_seekable.h | 187 ++++ .../zstd_seekable_utils.hpp | 65 ++ .../zstdseek_compress.cpp | 369 +++++++ .../zstdseek_decompress.cpp | 484 ++++++++++ .../zstd-seekable-adapted/zstdseek_utils.cpp | 288 ++++++ deps | 12 + include/chunked_reader.hpp | 70 ++ include/config.hpp.in | 8 +- include/fasta_to_fastafs.hpp | 22 +- include/fastafs.hpp | 14 +- include/fivebit_fivebytes.hpp | 49 + include/flags.hpp | 2 + include/fourbit_byte.hpp | 21 +- include/twobit_byte.hpp | 30 +- include/utils.hpp | 6 +- scripts/benchmark.py | 88 ++ scripts/test_utils.py | 1 + scripts/utils.py | 44 + src/chunked_reader.cpp | 240 +++++ src/database.cpp | 82 +- src/fasta_to_fastafs.cpp | 905 ++++++++++++++---- src/fastafs.cpp | 653 +++++++------ src/fivebit_fivebytes.cpp | 282 ++++++ src/flags.cpp | 15 + src/fourbit_byte.cpp | 35 +- src/fuse.cpp | 177 +++- src/main.cpp | 56 +- src/twobit_byte.cpp | 30 +- src/utils.cpp | 39 +- test/CMakeLists.txt | 215 ++++- test/cache/test_cache.cpp | 150 ++- test/chunked_reader/test_chunked_reader.cpp | 306 ++++++ test/data/test_010.fa | 7 + test/data/test_011.fa | 2 + test/fastafs/test_fastafs.cpp | 128 ++- .../test_fivebit_fivebytes.cpp | 202 ++++ test/test_functional.py | 32 +- test/test_utils.py | 170 +++- 
test/view/test_view.cpp | 242 ++++- tmp/.gitignore | 9 + tmp/benchmark/.gitignore | 2 + 56 files changed, 6689 insertions(+), 758 deletions(-) create mode 100644 .gitmodules create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt create mode 100644 dependencies/zstd-lib-common/LICENSE create mode 100644 dependencies/zstd-lib-common/README.txt create mode 100644 dependencies/zstd-lib-common/mem.h create mode 100644 dependencies/zstd-lib-common/xxhash.c create mode 100644 dependencies/zstd-lib-common/xxhash.h create mode 100644 dependencies/zstd-seekable-adapted/CMakeLists.txt create mode 100644 dependencies/zstd-seekable-adapted/README.md create mode 100644 dependencies/zstd-seekable-adapted/zstd_seekable.h create mode 100644 dependencies/zstd-seekable-adapted/zstd_seekable_utils.hpp create mode 100644 dependencies/zstd-seekable-adapted/zstdseek_compress.cpp create mode 100644 dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp create mode 100644 dependencies/zstd-seekable-adapted/zstdseek_utils.cpp create mode 100644 deps create mode 100644 include/chunked_reader.hpp create mode 100644 include/fivebit_fivebytes.hpp create mode 100755 scripts/benchmark.py create mode 120000 scripts/test_utils.py create mode 100644 scripts/utils.py create mode 100644 src/chunked_reader.cpp create mode 100644 src/fivebit_fivebytes.cpp create mode 100644 test/chunked_reader/test_chunked_reader.cpp create mode 100644 test/data/test_010.fa create mode 100644 test/data/test_011.fa create mode 100644 test/fivebit_fivebytes/test_fivebit_fivebytes.cpp create mode 100644 tmp/.gitignore create mode 100644 tmp/benchmark/.gitignore diff --git a/.gitignore b/.gitignore index 90d5b97d..2f70da61 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,6 @@ Testing/ cmake_install.cmake .directory /*.2bit -tmp/ repeats.txt build/ xcheck.sh diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..e69de29b diff --git 
a/CMakeLists.txt b/CMakeLists.txt index e37568f0..1f1c7a66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,10 +5,10 @@ cmake_minimum_required(VERSION 2.8) project(fastafs) -# Do this once in a while - find different bugs +# Do this once in a while - find different compiler warnings #set(CMAKE_CXX_COMPILER "clang++") -set(PROJECT_VERSION "1.7.5") +set(PROJECT_VERSION "1.8.0") set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs") set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues") @@ -77,10 +77,20 @@ add_custom_target(tidy DEPENDS make_tidy ) # ---------------------------------------------------------------------- # ---------------------------- Compilation ----------------------------- + add_subdirectory(src) -include_directories(include) #include_directories(${BUILD_DIR}) include_directories("${BUILD_DIR}/include") +include_directories("${CMAKE_SOURCE_DIR}/include") + + +# ZSTD +include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common") + +# ZSTD-SEEKABLE +add_subdirectory("${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted") +include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted") + add_definitions(-std=c++14) # Boost @@ -98,8 +108,10 @@ link_libraries(ssl) link_libraries(crypto) link_libraries(fuse) link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity +link_libraries(zstd) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") - +# -DXXH_NAMESPACE=ZST_ if(DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -Wall -g -ggdb -Wconversion -D_FILE_OFFSET_BITS=64")# -Werror makes compilation crash when warnings are given (also part of Travis) else() @@ -115,11 +127,19 @@ add_executable(fastafs src/ucsc2bit.cpp src/twobit_byte.cpp src/fourbit_byte.cpp + src/fivebit_fivebytes.cpp src/database.cpp src/utils.cpp src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp + src/chunked_reader.cpp + + + dependencies/zstd-lib-common/xxhash.c + dependencies/zstd-seekable-adapted/zstdseek_utils.cpp + 
dependencies/zstd-seekable-adapted/zstdseek_compress.cpp + dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp ) set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") @@ -133,11 +153,19 @@ add_executable(mount.fastafs src/ucsc2bit.cpp src/twobit_byte.cpp src/fourbit_byte.cpp + src/fivebit_fivebytes.cpp src/database.cpp src/utils.cpp src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp + src/chunked_reader.cpp + + + dependencies/zstd-lib-common/xxhash.c + dependencies/zstd-seekable-adapted/zstdseek_utils.cpp + dependencies/zstd-seekable-adapted/zstdseek_compress.cpp + dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp ) set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") @@ -149,11 +177,19 @@ add_library(libfastafs SHARED src/ucsc2bit.cpp src/twobit_byte.cpp src/fourbit_byte.cpp + src/fivebit_fivebytes.cpp src/database.cpp src/utils.cpp src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp + src/chunked_reader.cpp + + + dependencies/zstd-lib-common/xxhash.c + dependencies/zstd-seekable-adapted/zstdseek_utils.cpp + dependencies/zstd-seekable-adapted/zstdseek_compress.cpp + dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp ) target_include_directories(libfastafs PUBLIC include) target_sources(libfastafs PUBLIC include/fastafs.hpp) @@ -180,6 +216,7 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as al add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) +add_test(test_fivebit_fivebytes "${BUILD_TEST_DIR}/test_fivebit_fivebytes") add_test(test_cache "${BUILD_TEST_DIR}/test_cache") add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_flags "${BUILD_TEST_DIR}/test_flags") diff --git a/Changelog b/Changelog index 6c5d1bef..566f34e0 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,11 @@ +2020-04-29 Youri Hoogstrate + + * v1.8.0 + 
* Support for zstd-seekable + * Support for protein sequences using 8-to-5 byte compression + * Object type for chunked/buffered file reading + * Requires libzstd (1.4.5 and above) as dependency + 2020-03-03 Youri Hoogstrate * v1.7.5 diff --git a/README.md b/README.md index ad749732..242f4111 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,21 @@ Direct link to the file format specification: ![](https://bioinf-galaxian.erasmusmc.nl/public/images/fastafs/fastafs-example.gif) -## in a compressed and random access manner +## Elegant integration of sequence data archives, backwards compatible with FASTA and no API's needed -RNA, DNA and protein sequences are commonly stored in the FASTA format. Although very commonly used and easy to read, FASTA files consume vast amounts of diskspace and need to be provided with additional files to achieve random access and interoperability. Classical compressors only offer back and forwards compression of the files, often requiring to decompress to a new copy of the FASTA file. +RNA, DNA and protein sequences are commonly stored in the FASTA format. Although very commonly used and easy to read, FASTA files come with additional metadata files and consume unnecessary disk space. These additional metadata files are necessary to achieve random access and certain interoperability features, and require additional maintenance. Classical FASTA (de-)compressors only offer back and forwards compression of the files, often requiring decompression to a new copy of the FASTA file, making them impractical solutions, in particular for random access use cases. Although they typically produce very compact archives with quick algorithms, they are not widely adopted in our bioinformatics software. -Here we propose a solution; a virtual layer to (random access) TwoBit/FourBit compression that provides read-only access to a FASTA file and the guarenteed in-sync FAI, DICT and 2BIT files, through a FUSE file system layer.
By simply mounting the compressed archive as a FASTA and necessary metadata files, we only virtualize chunks of the FASTA corresponding to an file request. Additional advantages of FASTAFS are the toolkit and interface are sequence verification, checking file integrity and a feature rich toolskit that allows management of the mounted files. +Here we propose a solution; a virtual layer between (random access) FASTA archives and read-only access to FASTA files and their guaranteed in-sync FAI, DICT and 2BIT files, through the File System in Userspace (FUSE) file system layer. When the archive is mounted, fastafs virtualizes a folder containing the FASTA and necessary metadata files, only accessing the chunks of the archive needed to serve the file request. This elegant software solution offers several advantages: + - virtual files and their system calls are identical to flat files and preserve backwards compatibility with tools only compatible with FASTA, also for random access use-cases, + - there is no need to use additional disk space for temporary decompression or to put entire FASTA files into memory, + - for random access requests, computational resources are only spent on decompressing the region of interest, + - it does not need multiple implementations of software libraries for each distinct tool and for each programming language, + - it does not require to maintain multiple files that all together make up one data entity as it is guaranteed to provide dict- and fai-files that are in sync with their FASTA of origin. -FASTAFS is deliberately made backwards compatible with both TwoBit and Fasta. The package even allows to mount TwoBit files instead of FASTAFS files, to FASTA files. An important question is whether FASTAFS is this famous 15th standard ()? -Partially, but it is not designed to replace FASTA nor 2bit as the mountpoints provide an exact identical way of file access as regular flat file acces, and is thus backwards compatible. 
+In addition, the corresponding toolkit offers an interface that allows ENA sequence identification, file integrity verification and management of the mounted files and process ids. + +FASTAFS is deliberately made backwards compatible with both TwoBit and Fasta. The package even allows to mount TwoBit files instead of FASTAFS files, to FASTA files. For those who believe FASTAFS is this famous 15th standard ()? +Partially, it is not designed to replace FASTA nor TwoBit as the mountpoints provide an exact identical way of file access as regular flat file acces, and is thus backwards compatible. Instead, it offers the same old standard with an elegant toolkit that allows easier integration with workflow management systems. ## Installation and compilation diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt b/benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt new file mode 100644 index 00000000..aa7dd804 --- /dev/null +++ b/benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt @@ -0,0 +1,4 @@ +timestamp git-commit perf:cycles perf:total_time perf:user_time perf:sys_time cmd git-mod-status +2020-12-28 11:49:37.552478 a8f0d23cf01e6c622932427222862e481b20141c 211512123619 47.812391901 77.174097 0.834557 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps +2020-12-28 11:52:00.792104 a8f0d23cf01e6c622932427222862e481b20141c 212350209391 45.673330693 75.748635 0.850329 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps +2020-12-28 11:53:58.344964 a8f0d23cf01e6c622932427222862e481b20141c 215102039446 47.109749872 
76.433183 0.887304 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps diff --git a/dependencies/.gitignore b/dependencies/.gitignore index f681a6b2..4fefb764 100644 --- a/dependencies/.gitignore +++ b/dependencies/.gitignore @@ -1 +1,2 @@ IntervalTree.hpp +/zstd diff --git a/dependencies/zstd-lib-common/LICENSE b/dependencies/zstd-lib-common/LICENSE new file mode 100644 index 00000000..a793a802 --- /dev/null +++ b/dependencies/zstd-lib-common/LICENSE @@ -0,0 +1,30 @@ +BSD License + +For Zstandard software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/dependencies/zstd-lib-common/README.txt b/dependencies/zstd-lib-common/README.txt new file mode 100644 index 00000000..da98b66f --- /dev/null +++ b/dependencies/zstd-lib-common/README.txt @@ -0,0 +1 @@ +These files are copied from ZSTD 1.4.5 (lib/common) diff --git a/dependencies/zstd-lib-common/mem.h b/dependencies/zstd-lib-common/mem.h new file mode 100644 index 00000000..89c8aea7 --- /dev/null +++ b/dependencies/zstd-lib-common/mem.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-**************************************** +* Dependencies +******************************************/ +#include /* size_t, ptrdiff_t */ +#include /* memcpy */ + + +/*-**************************************** +* Compiler specifics +******************************************/ +#if defined(_MSC_VER) /* Visual Studio */ +# include /* _byteswap_ulong */ +# include /* _byteswap_* */ +#endif +#if defined(__GNUC__) +# define MEM_STATIC static __inline __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define MEM_STATIC static inline +#elif defined(_MSC_VER) +# define MEM_STATIC static __inline +#else +# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + +#ifndef __has_builtin +# define __has_builtin(x) 0 /* compat. with non-clang compilers */ +#endif + +/* code only tested on 32 and 64 bits systems */ +#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } +MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } + +/* detects whether we are being compiled under msan */ +#if defined (__has_feature) +# if __has_feature(memory_sanitizer) +# define MEMORY_SANITIZER 1 +# endif +#endif + +#if defined (MEMORY_SANITIZER) +/* Not all platforms that support msan provide sanitizers/msan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +#include /* intptr_t */ + +/* Make memory region fully initialized (without changing its contents). */ +void __msan_unpoison(const volatile void *a, size_t size); + +/* Make memory region fully uninitialized (without changing its contents). + This is a legacy interface that does not update origin information. 
Use + __msan_allocated_memory() instead. */ +void __msan_poison(const volatile void *a, size_t size); + +/* Returns the offset of the first (at least partially) poisoned byte in the + memory range, or -1 if the whole range is good. */ +intptr_t __msan_test_shadow(const volatile void *x, size_t size); +#endif + +/* detects whether we are being compiled under asan */ +#if defined (__has_feature) +# if __has_feature(address_sanitizer) +# define ADDRESS_SANITIZER 1 +# endif +#elif defined(__SANITIZE_ADDRESS__) +# define ADDRESS_SANITIZER 1 +#endif + +#if defined (ADDRESS_SANITIZER) +/* Not all platforms that support asan provide sanitizers/asan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +/** + * Marks a memory region ([addr, addr+size)) as unaddressable. + * + * This memory must be previously allocated by your program. Instrumented + * code is forbidden from accessing addresses in this region until it is + * unpoisoned. This function is not guaranteed to poison the entire region - + * it could poison only a subregion of [addr, addr+size) due to ASan + * alignment restrictions. + * + * \note This function is not thread-safe because no two threads can poison or + * unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_poison_memory_region(void const volatile *addr, size_t size); + +/** + * Marks a memory region ([addr, addr+size)) as addressable. + * + * This memory must be previously allocated by your program. Accessing + * addresses in this region is allowed until this region is poisoned again. + * This function could unpoison a super-region of [addr, addr+size) due + * to ASan alignment restrictions. + * + * \note This function is not thread-safe because no two threads can + * poison or unpoison memory in the same memory region simultaneously. 
+ * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#endif + + +/*-************************************************************** +* Basic Types +*****************************************************************/ +#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef int16_t S16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef int64_t S64; +#else +# include +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif + typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif + typedef unsigned short U16; + typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif + typedef unsigned int U32; + typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, is preferred */ + typedef unsigned long long U64; + typedef signed long long S64; +#endif + + +/*-************************************************************** +* Memory I/O +*****************************************************************/ +/* MEM_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable). 
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets depending on alignment. + * In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6) + * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define MEM_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# define MEM_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } +MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } + +MEM_STATIC unsigned MEM_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) + +/* violates C standard, by lying on structure alignment. 
+Only use if no other choice to achieve best performance on target platform */ +MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } +MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } +MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } +MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } + +#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) + __pragma( pack(push, 1) ) + typedef struct { U16 v; } unalign16; + typedef struct { U32 v; } unalign32; + typedef struct { U64 v; } unalign64; + typedef struct { size_t v; } unalignArch; + __pragma( pack(pop) ) +#else + typedef struct { U16 v; } __attribute__((packed)) unalign16; + typedef struct { U32 v; } __attribute__((packed)) unalign32; + typedef struct { U64 v; } __attribute__((packed)) unalign64; + typedef struct { size_t v; } __attribute__((packed)) unalignArch; +#endif + +MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; } +MEM_STATIC void MEM_write64(void* 
memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } + +#else + +/* default method, safe and standard. + can sometimes prove slower */ + +MEM_STATIC U16 MEM_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U32 MEM_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U64 MEM_read64(const void* memPtr) +{ + U64 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC size_t MEM_readST(const void* memPtr) +{ + size_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write64(void* memPtr, U64 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* MEM_FORCE_MEMORY_ACCESS */ + +MEM_STATIC U32 MEM_swap32(U32 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_ulong(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) + return __builtin_bswap32(in); +#else + return ((in << 24) & 0xff000000 ) | + ((in << 8) & 0x00ff0000 ) | + ((in >> 8) & 0x0000ff00 ) | + ((in >> 24) & 0x000000ff ); +#endif +} + +MEM_STATIC U64 MEM_swap64(U64 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_uint64(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) + return __builtin_bswap64(in); +#else + return ((in << 56) & 0xff00000000000000ULL) | + ((in << 40) & 0x00ff000000000000ULL) | + ((in << 24) & 0x0000ff0000000000ULL) | + ((in << 8) & 0x000000ff00000000ULL) | + ((in >> 8) & 0x00000000ff000000ULL) | + ((in >> 24) & 0x0000000000ff0000ULL) | + ((in >> 40) & 0x000000000000ff00ULL) | + ((in >> 56) & 0x00000000000000ffULL); 
+#endif +} + +MEM_STATIC size_t MEM_swapST(size_t in) +{ + if (MEM_32bits()) + return (size_t)MEM_swap32((U32)in); + else + return (size_t)MEM_swap64((U64)in); +} + +/*=== Little endian r/w ===*/ + +MEM_STATIC U16 MEM_readLE16(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read16(memPtr); + else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) +{ + if (MEM_isLittleEndian()) { + MEM_write16(memPtr, val); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + +MEM_STATIC U32 MEM_readLE24(const void* memPtr) +{ + return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); +} + +MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) +{ + MEM_writeLE16(memPtr, (U16)val); + ((BYTE*)memPtr)[2] = (BYTE)(val>>16); +} + +MEM_STATIC U32 MEM_readLE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read32(memPtr); + else + return MEM_swap32(MEM_read32(memPtr)); +} + +MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, val32); + else + MEM_write32(memPtr, MEM_swap32(val32)); +} + +MEM_STATIC U64 MEM_readLE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read64(memPtr); + else + return MEM_swap64(MEM_read64(memPtr)); +} + +MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, val64); + else + MEM_write64(memPtr, MEM_swap64(val64)); +} + +MEM_STATIC size_t MEM_readLEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readLE32(memPtr); + else + return (size_t)MEM_readLE64(memPtr); +} + +MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeLE32(memPtr, (U32)val); + else + MEM_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +MEM_STATIC U32 MEM_readBE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return 
MEM_swap32(MEM_read32(memPtr)); + else + return MEM_read32(memPtr); +} + +MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, MEM_swap32(val32)); + else + MEM_write32(memPtr, val32); +} + +MEM_STATIC U64 MEM_readBE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap64(MEM_read64(memPtr)); + else + return MEM_read64(memPtr); +} + +MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, MEM_swap64(val64)); + else + MEM_write64(memPtr, val64); +} + +MEM_STATIC size_t MEM_readBEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readBE32(memPtr); + else + return (size_t)MEM_readBE64(memPtr); +} + +MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeBE32(memPtr, (U32)val); + else + MEM_writeBE64(memPtr, (U64)val); +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* MEM_H_MODULE */ diff --git a/dependencies/zstd-lib-common/xxhash.c b/dependencies/zstd-lib-common/xxhash.c new file mode 100644 index 00000000..429a869e --- /dev/null +++ b/dependencies/zstd-lib-common/xxhash.c @@ -0,0 +1,862 @@ +/* + * xxHash - Fast Hash algorithm + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash homepage: http://www.xxhash.com + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. 
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \ + defined(__ICCARM__) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. + * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. + * By default, this option is disabled. 
To enable it, uncomment below define : + */ +/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. + * Should endian-independence be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. + */ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; set to 0 when the input data + * is guaranteed to be aligned. 
+ */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/* Modify the local functions below should you wish to use some other memory routines */ +/* for malloc(), free() */ +#include +#include /* size_t */ +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free (void* p) { free(p); } +/* for memcpy() */ +#include +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } + +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +#endif +#include "xxhash.h" + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR + + +#ifdef _MSC_VER +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# define MEM_MODULE +# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +# else + typedef unsigned 
char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */ +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } +static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; + +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +#if defined(__ICCARM__) +# include +# define XXH_rotl32(x,r) __ROR(x,(32 - r)) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +# define XXH_swap64 _byteswap_uint64 +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +# define XXH_swap64 __builtin_bswap64 +#else +static U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +static U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef 
XXH_CPU_LITTLE_ENDIAN + static const int g_one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one)) +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + +FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + + +/* ************************************* +* Constants +***************************************/ +static const U32 PRIME32_1 = 2654435761U; +static const U32 PRIME32_2 = 2246822519U; +static const U32 PRIME32_3 = 3266489917U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; + +static const U64 PRIME64_1 = 11400714785074694791ULL; +static const U64 PRIME64_2 = 14029467366897019727ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; + +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ************************** +* Utils +****************************/ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + + +/* *************************** +* Simple Hash Functions +*****************************/ + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) 
{ + const BYTE* const limit = bEnd - 16; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4; + } while (p<=limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (U32) len; + + while (p+4<=bEnd) { + h32 += XXH_get32bits(p) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_CREATESTATE_STATIC(state); + XXH32_reset(state, seed); + XXH32_update(state, input, len); + return XXH32_digest(state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = 
XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + U64 h64; +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8; + } while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + while (p+8<=bEnd) { + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) { + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_CREATESTATE_STATIC(state); + XXH64_reset(state, seed); + XXH64_update(state, input, len); + return XXH64_digest(state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if 
(XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + +/* ************************************************** +* Advanced Hash Functions +****************************************************/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + + +/*** Hash feed ***/ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */ + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for 
future removal */ + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* 
state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem32; + const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize; + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + while (p+4<=bEnd) { + h32 += XXH_readLE32(p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + + +/* **** XXH64 **** */ + +FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + 
XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem64; + const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize; + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = 
XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 + PRIME64_5; + } + + h64 += (U64) state->total_len; + + while (p+8<=bEnd) { + U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian)); + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) { + h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + +/* ************************** +* Canonical representation +****************************/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs. 
+*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} diff --git a/dependencies/zstd-lib-common/xxhash.h b/dependencies/zstd-lib-common/xxhash.h new file mode 100644 index 00000000..4207eba8 --- /dev/null +++ b/dependencies/zstd-lib-common/xxhash.h @@ -0,0 +1,285 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. 
+ +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bits version, named XXH64, is available since r35. +It offers much better speed, but for 64-bits applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/* **************************** +* API modifier +******************************/ +/** XXH_PRIVATE_API +* This is useful if you want to include xxhash functions in `static` mode +* in order to inline them, and remove their symbol from the public list. +* Methodology : +* #define XXH_PRIVATE_API +* #include "xxhash.h" +* `xxhash.c` is automatically included. +* It's not useful to compile and link it as a separate module anymore. 
+*/ +#ifdef XXH_PRIVATE_API +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else +# define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */ +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_PRIVATE_API */ + +/*!XXH_NAMESPACE, aka Namespace Emulation : + +If you want to include _and expose_ xxHash functions from within your own library, +but also want to avoid symbol collisions with another library which also includes xxHash, + +you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library +with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values). + +Note that no change is required within the calling program as long as it includes `xxhash.h` : +regular symbol name will be automatically translated by this header. 
+*/ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 2 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Simple Hash Functions +******************************/ +typedef unsigned int XXH32_hash_t; +typedef unsigned long long XXH64_hash_t; + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); 
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); + +/*! +XXH32() : + Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s +XXH64() : + Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark). +*/ + + +/* **************************** +* Streaming Hash Functions +******************************/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! State allocation, compatible with dynamic libraries */ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + + +/* hash streaming */ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/* +These functions generate the xxHash of an input provided in multiple segments. +Note that, for small input, they are slower than single-call functions, due to state management. 
+For small input, prefer `XXH32()` and `XXH64()` . + +XXH state must first be allocated, using XXH*_createState() . + +Start a new hash by initializing state with a seed, using XXH*_reset(). + +Then, feed the hash state by calling XXH*_update() as many times as necessary. +Obviously, input must be allocated and read accessible. +The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + +Finally, a hash value can be produced anytime, by using XXH*_digest(). +This function returns the nn-bits hash as an int or long long. + +It's still possible to continue inserting input into the hash state after a digest, +and generate some new hashes later on, by calling again XXH*_digest(). + +When done, free XXH state space if it was allocated dynamically. +*/ + + +/* ************************** +* Utils +****************************/ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */ +# define restrict /* disable restrict */ +#endif + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state); + + +/* ************************** +* Canonical representation +****************************/ +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. +* The canonical representation uses human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. 
+*/ +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +#endif /* XXHASH_H_5627135585666179 */ + + + +/* ================================================================================================ + This section contains definitions which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + They shall only be used with static linking. + Never use these definitions in association with dynamic linking ! +=================================================================================================== */ +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345) +#define XXH_STATIC_H_3543687687345 + +/* These definitions are only meant to allow allocation of XXH state + statically, on stack, or in a struct for example. + Do not use members directly. 
*/ + + struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; /* buffer defined as U32 for alignment */ + unsigned memsize; + unsigned reserved; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + + struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; /* buffer defined as U64 for alignment */ + unsigned memsize; + unsigned reserved[2]; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ + + +# ifdef XXH_PRIVATE_API +# include "xxhash.c" /* include xxhash functions as `static`, for inlining */ +# endif + +#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/dependencies/zstd-seekable-adapted/CMakeLists.txt b/dependencies/zstd-seekable-adapted/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/dependencies/zstd-seekable-adapted/README.md b/dependencies/zstd-seekable-adapted/README.md new file mode 100644 index 00000000..ce924529 --- /dev/null +++ b/dependencies/zstd-seekable-adapted/README.md @@ -0,0 +1 @@ +This should be identical to the reference implementation of zstd-seekable but with minor tweaks to get it working with CPP diff --git a/dependencies/zstd-seekable-adapted/zstd_seekable.h b/dependencies/zstd-seekable-adapted/zstd_seekable.h new file mode 100644 index 00000000..6777048e --- /dev/null +++ b/dependencies/zstd-seekable-adapted/zstd_seekable.h @@ -0,0 +1,187 @@ +#ifndef SEEKABLE_H +#define SEEKABLE_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include +#include "zstd.h" /* ZSTDLIB_API */ + + +#define ZSTD_seekTableFooterSize 9 + +#define ZSTD_SEEKABLE_MAGICNUMBER 0x8F92EAB1 + +#define ZSTD_SEEKABLE_MAXFRAMES 0x8000000U + +/* 
Limit the maximum size to avoid any potential issues storing the compressed size */ +#define ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE 0x80000000U + +/*-**************************************************************************** +* Seekable Format +* +* The seekable format splits the compressed data into a series of "frames", +* each compressed individually so that decompression of a section in the +* middle of an archive only requires zstd to decompress at most a frame's +* worth of extra data, instead of the entire archive. +******************************************************************************/ + +typedef struct ZSTD_seekable_CStream_s ZSTD_seekable_CStream; +typedef struct ZSTD_seekable_s ZSTD_seekable; + +/*-**************************************************************************** +* Seekable compression - HowTo +* A ZSTD_seekable_CStream object is required to track the streaming operation. +* Use ZSTD_seekable_createCStream() and ZSTD_seekable_freeCStream() to create/ +* release resources. +* +* Streaming objects are reusable to avoid allocation and deallocation, +* to start a new compression operation call ZSTD_seekable_initCStream() on the +* compressor. +* +* Data streamed to the seekable compressor will automatically be split into +* frames of size `maxFrameSize` (provided in ZSTD_seekable_initCStream()), +* or if none is provided, will be cut off whenever ZSTD_seekable_endFrame() is +* called or when the default maximum frame size (2GB) is reached. +* +* Use ZSTD_seekable_initCStream() to initialize a ZSTD_seekable_CStream object +* for a new compression operation. +* `maxFrameSize` indicates the size at which to automatically start a new +* seekable frame. `maxFrameSize == 0` implies the default maximum size. +* `checksumFlag` indicates whether or not the seek table should include frame +* checksums on the uncompressed data for verification.
+* @return : a size hint for input to provide for compression, or an error code +* checkable with ZSTD_isError() +* +* Use ZSTD_seekable_compressStream() repetitively to consume input stream. +* The function will automatically update both `pos` fields. +* Note that it may not consume the entire input, in which case `pos < size`, +* and it's up to the caller to present again remaining data. +* @return : a size hint, preferred nb of bytes to use as input for next +* function call or an error code, which can be tested using +* ZSTD_isError(). +* Note 1 : it's just a hint, to help latency a little, any other +* value will work fine. +* +* At any time, call ZSTD_seekable_endFrame() to end the current frame and +* start a new one. +* +* ZSTD_seekable_endStream() will end the current frame, and then write the seek +* table so that decompressors can efficiently find compressed frames. +* ZSTD_seekable_endStream() may return a number > 0 if it was unable to flush +* all the necessary data to `output`. In this case, it should be called again +* until all remaining data is flushed out and 0 is returned. +******************************************************************************/ + +/*===== Seekable compressor management =====*/ +ZSTDLIB_API ZSTD_seekable_CStream* ZSTD_seekable_createCStream(void); +ZSTDLIB_API size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs); + +/*===== Seekable compression functions =====*/ +ZSTDLIB_API size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, int compressionLevel, int checksumFlag, unsigned maxFrameSize); +ZSTDLIB_API size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); +ZSTDLIB_API size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); +ZSTDLIB_API size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); + +/*= Raw seek table API + * These functions allow for the seek table to be constructed directly. 
+ * This table can then be appended to a file of concatenated frames. + * This allows the frames to be compressed independently, even in parallel, + * and compiled together afterward into a seekable archive. + * + * Use ZSTD_seekable_createFrameLog() to allocate and initialize a tracking + * structure. + * + * Call ZSTD_seekable_logFrame() once for each frame in the archive. + * checksum is optional, and will not be used if checksumFlag was 0 when the + * frame log was created. If present, it should be the least significant 32 + * bits of the XXH64 hash of the uncompressed data. + * + * Call ZSTD_seekable_writeSeekTable to serialize the data into a seek table. + * If the entire table was written, the return value will be 0. Otherwise, + * it will be equal to the number of bytes left to write. */ +typedef struct ZSTD_frameLog_s ZSTD_frameLog; +ZSTDLIB_API ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag); +ZSTDLIB_API size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl); +ZSTDLIB_API size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, unsigned compressedSize, unsigned decompressedSize, unsigned checksum); +ZSTDLIB_API size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output); + +/*-**************************************************************************** +* Seekable decompression - HowTo +* A ZSTD_seekable object is required to track the seek table. +* +* Call ZSTD_seekable_init* to initialize a ZSTD_seekable object with the +* seek table provided in the input. +* There are three modes for ZSTD_seekable_init: +* - ZSTD_seekable_initBuff() : An in-memory API. The data contained in +* `src` should be the entire seekable file, including the seek table. +* `src` should be kept alive and unmodified until the ZSTD_seekable object +* is freed or reset. +* - ZSTD_seekable_initFile() : A simplified file API using stdio.
fread and +* fseek will be used to access the required data for building the seek +* table and doing decompression operations. `src` should not be closed +* or modified until the ZSTD_seekable object is freed or reset. +* - ZSTD_seekable_initAdvanced() : A general API allowing the client to +* provide its own read and seek callbacks. +* + ZSTD_seekable_read() : read exactly `n` bytes into `buffer`. +* Premature EOF should be treated as an error. +* + ZSTD_seekable_seek() : seek the read head to `offset` from `origin`, +* where origin is either SEEK_SET (beginning of +* file), or SEEK_END (end of file). +* Both functions should return a non-negative value in case of success, and a +* negative value in case of failure. If implementing using this API and +* stdio, be careful with files larger than 4GB and fseek. All of these +* functions return an error code checkable with ZSTD_isError(). +* +* Call ZSTD_seekable_decompress to decompress `dstSize` bytes at decompressed +* offset `offset`. ZSTD_seekable_decompress may have to decompress the entire +* prefix of the frame before the desired data if it has not already processed +* this section. If ZSTD_seekable_decompress is called multiple times for a +* consecutive range of data, it will efficiently retain the decompressor object +* and avoid redecompressing frame prefixes. The return value is the number of +* bytes decompressed, or an error code checkable with ZSTD_isError(). +* +* The seek table access functions can be used to obtain the data contained +* in the seek table. If frameIndex is larger than or equal to the value returned by +* ZSTD_seekable_getNumFrames(), they will return error codes checkable with +* ZSTD_isError(). Note that since the offset access functions return +* unsigned long long instead of size_t, in this case they will instead return +* the value ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE.
+******************************************************************************/ + +/*===== Seekable decompressor management =====*/ +ZSTDLIB_API ZSTD_seekable* ZSTD_seekable_create(void); +ZSTDLIB_API size_t ZSTD_seekable_free(ZSTD_seekable* zs); + +/*===== Seekable decompression functions =====*/ +ZSTDLIB_API size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src); +ZSTDLIB_API size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned long long offset); +ZSTDLIB_API size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_getFileDecompressedSize(ZSTD_seekable* zs); + +#define ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE (0ULL-2) +/*===== Seek Table access functions =====*/ +ZSTDLIB_API unsigned ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs); +ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API unsigned ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long offset); + +/*===== Seekable advanced I/O API =====*/ +typedef int(ZSTD_seekable_read)(void* opaque, void* buffer, size_t n); +typedef int(ZSTD_seekable_seek)(void* opaque, long long offset, int origin); +typedef struct { + void* opaque; + ZSTD_seekable_read* read; + ZSTD_seekable_seek* seek; +} ZSTD_seekable_customFile; +ZSTDLIB_API size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src); + +#if defined (__cplusplus) +} +#endif + +#endif diff --git 
a/dependencies/zstd-seekable-adapted/zstd_seekable_utils.hpp b/dependencies/zstd-seekable-adapted/zstd_seekable_utils.hpp new file mode 100644 index 00000000..6d25f96a --- /dev/null +++ b/dependencies/zstd-seekable-adapted/zstd_seekable_utils.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include // strlen, memset, strcat + + +#include // malloc, exit +#include // fprintf, perror, feof +#include // strerror +#include // errno +//#define ZSTD_STATIC_LINKING_ONLY +#include // presumes zstd library is installed +#include + +#include "zstd_seekable.h" + + + +void* malloc_orDie(size_t size); + +FILE* fopen_orDie(const char *filename, const char *instruction); + +size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file); + +size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file); + +size_t fclose_orDie(FILE* file); + +void* realloc_orDie(void* ptr, size_t size); + +void fseek_orDie(FILE* file, long int offset, int origin); + + + +size_t ZSTD_seekable_compressFile_orDie(const char* fname, const char* outName, int cLevel, unsigned frameSize); + + + + +#ifndef ZSTD_SEEKABLE_DECOMPRESS_INIT_DATA_HPP +#define ZSTD_SEEKABLE_DECOMPRESS_INIT_DATA_HPP + +// struct for already initialised reading +struct ZSTD_seekable_decompress_init_data { + FILE* fin; + //bool fin_locked; + + //ZSTD_seekable* seekable; + + // size_t const initResult; // to be added later if always the same? + // size_t maxFileSize; // to be added later if always the same? 
+}; + +#endif + + +ZSTD_seekable_decompress_init_data* ZSTD_seekable_decompressFile_init(const char* ); +size_t ZSTD_seekable_decompressFile_orDie(ZSTD_seekable_decompress_init_data* , off_t, char* , off_t); +size_t ZSTD_seekable_decompressFile_orDie(const char* , off_t , char *, off_t ); diff --git a/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp b/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp new file mode 100644 index 00000000..5506c424 --- /dev/null +++ b/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include /* malloc, free */ +#include /* UINT_MAX */ +#include + +#define XXH_STATIC_LINKING_ONLY +//#define XXH_NAMESPACE ZSTD_ +#include "xxhash.h" + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zstd_errors.h" +#include "mem.h" +#include "zstd_seekable.h" + +#define CHECK_Z(f) { size_t const ret = (f); if (ret != 0) return ret; } + +#undef ERROR +#define ERROR(name) ((size_t)-ZSTD_error_##name) + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +typedef struct { + U32 cSize; + U32 dSize; + U32 checksum; +} framelogEntry_t; + +struct ZSTD_frameLog_s { /* growable array of per-frame entries plus the cursor used while writing the seek table */ + framelogEntry_t* entries; + U32 size; + U32 capacity; + + int checksumFlag; + + /* for use when streaming out the seek table */ + U32 seekTablePos; + U32 seekTableIndex; +}; + +struct ZSTD_seekable_CStream_s { + ZSTD_CStream* cstream; + ZSTD_frameLog framelog; + + U32 frameCSize; + U32 frameDSize; + + XXH64_state_t xxhState; + + U32 maxFrameSize; + + int writingSeekTable; +}; + +size_t ZSTD_seekable_frameLog_allocVec(ZSTD_frameLog* fl) +{ + /* allocate some initial space */ + size_t const FRAMELOG_STARTING_CAPACITY = 16; + fl->entries = (framelogEntry_t*)malloc( + sizeof(framelogEntry_t) * FRAMELOG_STARTING_CAPACITY); + if (fl->entries == NULL) return ERROR(memory_allocation); + fl->capacity = FRAMELOG_STARTING_CAPACITY; + + return 0; +} + +size_t ZSTD_seekable_frameLog_freeVec(ZSTD_frameLog* fl) +{ + if (fl != NULL) free(fl->entries); + return 0; +} + +ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag) +{ + ZSTD_frameLog* fl = (ZSTD_frameLog*) malloc(sizeof(ZSTD_frameLog)); + if (fl == NULL) return NULL; + + if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(fl))) { + free(fl); + return NULL; + } + + fl->checksumFlag = checksumFlag; + fl->seekTablePos = 0; + fl->seekTableIndex = 0; + fl->size = 0; + + return fl; +} + +size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl) +{ + ZSTD_seekable_frameLog_freeVec(fl); + free(fl); + return 0; +} + +ZSTD_seekable_CStream* ZSTD_seekable_createCStream() +{ + ZSTD_seekable_CStream* zcs = (ZSTD_seekable_CStream*) malloc(sizeof(ZSTD_seekable_CStream)); + + if (zcs == NULL) return NULL; + + memset(zcs, 0, sizeof(*zcs)); + + zcs->cstream = ZSTD_createCStream(); + if (zcs->cstream == NULL) goto failed1; + + if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(&zcs->framelog))) goto failed2; + + return zcs; + +failed2: + ZSTD_freeCStream(zcs->cstream); +failed1: + free(zcs); + return NULL; +} +
+size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs) +{ + if (zcs == NULL) return 0; /* support free on null */ + ZSTD_freeCStream(zcs->cstream); + ZSTD_seekable_frameLog_freeVec(&zcs->framelog); + free(zcs); + + return 0; +} + +size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, + int compressionLevel, + int checksumFlag, + unsigned maxFrameSize) +{ + zcs->framelog.size = 0; + zcs->frameCSize = 0; + zcs->frameDSize = 0; + + /* make sure maxFrameSize has a reasonable value */ + if (maxFrameSize > ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE) { + return ERROR(frameParameter_unsupported); + } + + zcs->maxFrameSize = maxFrameSize + ? maxFrameSize + : ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE; + + zcs->framelog.checksumFlag = checksumFlag; + if (zcs->framelog.checksumFlag) { + XXH64_reset(&zcs->xxhState, 0); + } + + zcs->framelog.seekTablePos = 0; + zcs->framelog.seekTableIndex = 0; + zcs->writingSeekTable = 0; + + return ZSTD_initCStream(zcs->cstream, compressionLevel); +} + +size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, + unsigned compressedSize, + unsigned decompressedSize, + unsigned checksum) +{ + if (fl->size == ZSTD_SEEKABLE_MAXFRAMES) + return ERROR(frameIndex_tooLarge); + + /* grow the buffer if required */ + if (fl->size == fl->capacity) { + /* exponential size increase for constant amortized runtime */ + size_t const newCapacity = fl->capacity * 2; + framelogEntry_t* const newEntries = (framelogEntry_t*) realloc(fl->entries, + sizeof(framelogEntry_t) * newCapacity); + + if (newEntries == NULL) return ERROR(memory_allocation); + + fl->entries = newEntries; + assert(newCapacity <= UINT_MAX); + fl->capacity = (U32)newCapacity; + } + + fl->entries[fl->size] = (framelogEntry_t){ + compressedSize, decompressedSize, checksum + }; + fl->size++; + + return 0; +} + +size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output) +{ + size_t const prevOutPos = output->pos; + /* end the frame */ + size_t ret = 
ZSTD_endStream(zcs->cstream, output); + + zcs->frameCSize += (U32) (output->pos - prevOutPos); + + /* need to flush before doing the rest */ + if (ret) return ret; + + /* frame done */ + + /* store the frame data for later */ + ret = ZSTD_seekable_logFrame( + &zcs->framelog, zcs->frameCSize, zcs->frameDSize, + zcs->framelog.checksumFlag + ? XXH64_digest(&zcs->xxhState) & 0xFFFFFFFFU + : 0); + if (ret) return ret; + + /* reset for the next frame */ + zcs->frameCSize = 0; + zcs->frameDSize = 0; + + ZSTD_resetCStream(zcs->cstream, 0); + if (zcs->framelog.checksumFlag) + XXH64_reset(&zcs->xxhState, 0); + + return 0; +} + +size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + const BYTE* const inBase = (const BYTE*) input->src + input->pos; + size_t inLen = input->size - input->pos; + + inLen = MIN(inLen, (size_t)(zcs->maxFrameSize - zcs->frameDSize)); + + /* if we haven't finished flushing the last frame, don't start writing a new one */ + if (inLen > 0) { + ZSTD_inBuffer inTmp = { inBase, inLen, 0 }; + size_t const prevOutPos = output->pos; + + size_t const ret = ZSTD_compressStream(zcs->cstream, output, &inTmp); + + if (zcs->framelog.checksumFlag) { + XXH64_update(&zcs->xxhState, inBase, inTmp.pos); + } + + zcs->frameCSize += (U32) (output->pos - prevOutPos); + zcs->frameDSize += (U32) (inTmp.pos); + + input->pos += inTmp.pos; + + if (ZSTD_isError(ret)) return ret; + } + + if (zcs->maxFrameSize == zcs->frameDSize) { + /* log the frame and start over */ + size_t const ret = ZSTD_seekable_endFrame(zcs, output); + if (ZSTD_isError(ret)) return ret; + + /* get the client ready for the next frame */ + return (size_t)zcs->maxFrameSize; + } + + return (size_t)(zcs->maxFrameSize - zcs->frameDSize); +} + +static inline size_t ZSTD_seekable_seekTableSize(const ZSTD_frameLog* fl) +{ + size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0); + size_t const seekTableLen = ZSTD_SKIPPABLEHEADERSIZE + + sizePerFrame * 
fl->size + + ZSTD_seekTableFooterSize; + + return seekTableLen; +} + +static inline size_t ZSTD_stwrite32(ZSTD_frameLog* fl, + ZSTD_outBuffer* output, U32 const value, + U32 const offset) +{ + if (fl->seekTablePos < offset + 4) { + BYTE tmp[4]; /* so that we can work with buffers too small to write a whole word to */ + size_t const lenWrite = + MIN(output->size - output->pos, offset + 4 - fl->seekTablePos); + MEM_writeLE32(tmp, value); + memcpy((BYTE*)output->dst + output->pos, + tmp + (fl->seekTablePos - offset), lenWrite); + output->pos += lenWrite; + fl->seekTablePos += (U32) lenWrite; + + if (lenWrite < 4) return ZSTD_seekable_seekTableSize(fl) - fl->seekTablePos; + } + return 0; +} + +size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output) +{ + /* seekTableIndex: the current index in the table and + * seekTableSize: the amount of the table written so far + * + * This function is written this way so that if it has to return early + * because of a small buffer, it can keep going where it left off. 
+ */ + + size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0); + size_t const seekTableLen = ZSTD_seekable_seekTableSize(fl); + + CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_MAGIC_SKIPPABLE_START | 0xE, 0)); + assert(seekTableLen <= (size_t)UINT_MAX); + CHECK_Z(ZSTD_stwrite32(fl, output, (U32)seekTableLen - ZSTD_SKIPPABLEHEADERSIZE, 4)); + + while (fl->seekTableIndex < fl->size) { + unsigned long long const start = ZSTD_SKIPPABLEHEADERSIZE + sizePerFrame * fl->seekTableIndex; + assert(start + 8 <= UINT_MAX); + CHECK_Z(ZSTD_stwrite32(fl, output, + fl->entries[fl->seekTableIndex].cSize, + (U32)start + 0)); + + CHECK_Z(ZSTD_stwrite32(fl, output, + fl->entries[fl->seekTableIndex].dSize, + (U32)start + 4)); + + if (fl->checksumFlag) { + CHECK_Z(ZSTD_stwrite32( + fl, output, fl->entries[fl->seekTableIndex].checksum, + (U32)start + 8)); + } + + fl->seekTableIndex++; + } + + assert(seekTableLen <= UINT_MAX); + CHECK_Z(ZSTD_stwrite32(fl, output, fl->size, + (U32)seekTableLen - ZSTD_seekTableFooterSize)); + + if (output->size - output->pos < 1) return seekTableLen - fl->seekTablePos; + if (fl->seekTablePos < seekTableLen - 4) { + BYTE sfd = 0; + sfd |= (fl->checksumFlag) << 7; + + ((BYTE*)output->dst)[output->pos] = sfd; + output->pos++; + fl->seekTablePos++; + } + + CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_SEEKABLE_MAGICNUMBER, + (U32)seekTableLen - 4)); + + if (fl->seekTablePos != seekTableLen) return ERROR(GENERIC); + return 0; +} + +size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output) +{ + if (!zcs->writingSeekTable && zcs->frameDSize) { + const size_t endFrame = ZSTD_seekable_endFrame(zcs, output); + if (ZSTD_isError(endFrame)) return endFrame; + /* return an accurate size hint */ + if (endFrame) return endFrame + ZSTD_seekable_seekTableSize(&zcs->framelog); + } + + zcs->writingSeekTable = 1; + + return ZSTD_seekable_writeSeekTable(&zcs->framelog, output); +} diff --git a/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp 
b/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp new file mode 100644 index 00000000..4e99d3ae --- /dev/null +++ b/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* ********************************************************* +* Turn on Large Files support (>4GB) for 32-bit Linux/Unix +***********************************************************/ +#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ +# if !defined(_FILE_OFFSET_BITS) +# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ +# endif +# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */ +# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ +# endif +# if defined(_AIX) || defined(__hpux) +# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ +# endif +#endif + +/* ************************************************************ +* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW +***************************************************************/ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +# define LONG_SEEK _fseeki64 +#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */ +# define LONG_SEEK fseeko +#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__) +# define LONG_SEEK fseeko64 +#elif defined(_WIN32) && !defined(__DJGPP__) +# include + static int LONG_SEEK(FILE* file, __int64 offset, int 
origin) { + LARGE_INTEGER off; + DWORD method; + off.QuadPart = offset; + if (origin == SEEK_END) + method = FILE_END; + else if (origin == SEEK_CUR) + method = FILE_CURRENT; + else + method = FILE_BEGIN; + + if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method)) + return 0; + else + return -1; + } +#else +# define LONG_SEEK fseek +#endif + +#include <stdlib.h> /* malloc, free */ +#include <stdio.h> /* FILE* */ +#include <limits.h> /* UINT_MAX */ +#include <assert.h> + +#define XXH_STATIC_LINKING_ONLY +//#define XXH_NAMESPACE ZSTD_ +#include "xxhash.h" + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zstd_errors.h" +#include "mem.h" +#include "zstd_seekable.h" + +#undef ERROR +#define ERROR(name) ((size_t)-ZSTD_error_##name) + +#define CHECK_IO(f) { int const errcod = (f); if (errcod < 0) return ERROR(seekableIO); } + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +/* Special-case callbacks for FILE* and in-memory modes, so that we can treat + * them the same way as the advanced API */ +static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n) +{ + size_t const result = fread(buffer, 1, n, (FILE*)opaque); + if (result != n) { + return -1; + } + return 0; +} + +static int ZSTD_seekable_seek_FILE(void* opaque, long long offset, int origin) +{ + int const ret = LONG_SEEK((FILE*)opaque, offset, origin); + if (ret) return ret; + return fflush((FILE*)opaque); +} + +typedef struct { + const void *ptr; + size_t size; + size_t pos; +} buffWrapper_t; + +static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n) +{ + buffWrapper_t* buff = (buffWrapper_t*) opaque; + if (n > buff->size - buff->pos) return -1; /* compare against the bytes remaining so that pos + n cannot wrap around; relies on pos <= size */ + memcpy(buffer, (const BYTE*)buff->ptr + buff->pos, n); + buff->pos += n; + return 0; +} + +static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin) +{ + buffWrapper_t* const buff = (buffWrapper_t*) opaque; + unsigned long long newOffset; + switch (origin) {
+ case SEEK_SET: + newOffset = offset; + break; + case SEEK_CUR: + newOffset = (unsigned long long)buff->pos + offset; + break; + case SEEK_END: + newOffset = (unsigned long long)buff->size + offset; + break; + default: + assert(0); /* not possible */ + } + if (newOffset > buff->size) { + return -1; + } + buff->pos = newOffset; + return 0; +} + +typedef struct { + U64 cOffset; + U64 dOffset; + U32 checksum; +} seekEntry_t; + +typedef struct { + seekEntry_t* entries; + size_t tableLen; + + int checksumFlag; +} seekTable_t; + +#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_MAX + +struct ZSTD_seekable_s { + ZSTD_DStream* dstream; + seekTable_t seekTable; + ZSTD_seekable_customFile src; + + U64 decompressedOffset; + U32 curFrame; + + BYTE inBuff[SEEKABLE_BUFF_SIZE]; /* need to do our own input buffering */ + BYTE outBuff[SEEKABLE_BUFF_SIZE]; /* so we can efficiently decompress the + starts of chunks before we get to the + desired section */ + ZSTD_inBuffer in; /* maintain continuity across ZSTD_seekable_decompress operations */ + buffWrapper_t buffWrapper; /* for `src.opaque` in in-memory mode */ + + XXH64_state_t xxhState; +}; + +ZSTD_seekable* ZSTD_seekable_create(void) +{ + ZSTD_seekable* zs = (ZSTD_seekable*) malloc(sizeof(ZSTD_seekable)); + + if (zs == NULL) return NULL; + + /* also initializes stage to zsds_init */ + memset(zs, 0, sizeof(*zs)); + + zs->dstream = ZSTD_createDStream(); + if (zs->dstream == NULL) { + free(zs); + return NULL; + } + + return zs; +} + +size_t ZSTD_seekable_free(ZSTD_seekable* zs) +{ + if (zs == NULL) return 0; /* support free on null */ + ZSTD_freeDStream(zs->dstream); + free(zs->seekTable.entries); + free(zs); + + return 0; +} + +/** ZSTD_seekable_offsetToFrameIndex() : + * Performs a binary search to find the last frame with a decompressed offset + * <= pos + * @return : the frame's index */ +unsigned ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long pos) +{ + U32 lo = 0; + U32 hi = (U32)zs->seekTable.tableLen; 
+ assert(zs->seekTable.tableLen <= UINT_MAX); + + if (pos >= zs->seekTable.entries[zs->seekTable.tableLen].dOffset) { + return (U32)zs->seekTable.tableLen; + } + + while (lo + 1 < hi) { + U32 const mid = lo + ((hi - lo) >> 1); + if (zs->seekTable.entries[mid].dOffset <= pos) { + lo = mid; + } else { + hi = mid; + } + } + return lo; +} + +unsigned ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs) +{ + assert(zs->seekTable.tableLen <= UINT_MAX); + return (unsigned)zs->seekTable.tableLen; +} + +unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex) +{ + if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; + return zs->seekTable.entries[frameIndex].cOffset; +} + +unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex) +{ + if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; + return zs->seekTable.entries[frameIndex].dOffset; +} + +size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, unsigned frameIndex) +{ + if (frameIndex >= zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge); + return zs->seekTable.entries[frameIndex + 1].cOffset - + zs->seekTable.entries[frameIndex].cOffset; +} + +size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, unsigned frameIndex) +{ + if (frameIndex > zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge); + return zs->seekTable.entries[frameIndex + 1].dOffset - + zs->seekTable.entries[frameIndex].dOffset; +} + +static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs) +{ + int checksumFlag; + ZSTD_seekable_customFile src = zs->src; + /* read the footer, fixed size */ + CHECK_IO(src.seek(src.opaque, -(int)ZSTD_seekTableFooterSize, SEEK_END)); + CHECK_IO(src.read(src.opaque, zs->inBuff, ZSTD_seekTableFooterSize)); + + if (MEM_readLE32(zs->inBuff + 5) != ZSTD_SEEKABLE_MAGICNUMBER) { + return ERROR(prefix_unknown); + } + + { BYTE const sfd 
= zs->inBuff[4]; + checksumFlag = sfd >> 7; + + /* check reserved bits */ + if ((checksumFlag >> 2) & 0x1f) { + return ERROR(corruption_detected); + } + } + + { U32 const numFrames = MEM_readLE32(zs->inBuff); + U32 const sizePerEntry = 8 + (checksumFlag?4:0); + U32 const tableSize = sizePerEntry * numFrames; + U32 const frameSize = tableSize + ZSTD_seekTableFooterSize + ZSTD_SKIPPABLEHEADERSIZE; + + U32 remaining = frameSize - ZSTD_seekTableFooterSize; /* don't need to re-read footer */ + { + U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE); + + CHECK_IO(src.seek(src.opaque, -(S64)frameSize, SEEK_END)); + CHECK_IO(src.read(src.opaque, zs->inBuff, toRead)); + + remaining -= toRead; + } + + if (MEM_readLE32(zs->inBuff) != (ZSTD_MAGIC_SKIPPABLE_START | 0xE)) { + return ERROR(prefix_unknown); + } + if (MEM_readLE32(zs->inBuff+4) + ZSTD_SKIPPABLEHEADERSIZE != frameSize) { + return ERROR(prefix_unknown); + } + + { /* Allocate an extra entry at the end so that we can do size + * computations on the last element without special case */ + seekEntry_t* entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * (numFrames + 1)); + + U32 idx = 0; + U32 pos = 8; + + + U64 cOffset = 0; + U64 dOffset = 0; + + if (!entries) { + free(entries); + return ERROR(memory_allocation); + } + + /* compute cumulative positions */ + for (; idx < numFrames; idx++) { + if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) { + U32 const offset = SEEKABLE_BUFF_SIZE - pos; + U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset); + memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */ + CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead)); + remaining -= toRead; + pos = 0; + } + entries[idx].cOffset = cOffset; + entries[idx].dOffset = dOffset; + + cOffset += MEM_readLE32(zs->inBuff + pos); + pos += 4; + dOffset += MEM_readLE32(zs->inBuff + pos); + pos += 4; + if (checksumFlag) { + entries[idx].checksum = MEM_readLE32(zs->inBuff + pos); + pos += 4; + } + } + 
entries[numFrames].cOffset = cOffset; + entries[numFrames].dOffset = dOffset; + + zs->seekTable.entries = entries; + zs->seekTable.tableLen = numFrames; + zs->seekTable.checksumFlag = checksumFlag; + return 0; + } + } +} + +size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize) +{ + zs->buffWrapper = (buffWrapper_t){src, srcSize, 0}; + { ZSTD_seekable_customFile srcFile = {&zs->buffWrapper, + &ZSTD_seekable_read_buff, + &ZSTD_seekable_seek_buff}; + return ZSTD_seekable_initAdvanced(zs, srcFile); } +} + +size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src) +{ + ZSTD_seekable_customFile srcFile = {src, &ZSTD_seekable_read_FILE, + &ZSTD_seekable_seek_FILE}; + return ZSTD_seekable_initAdvanced(zs, srcFile); +} + +size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src) +{ + zs->src = src; + + { const size_t seekTableInit = ZSTD_seekable_loadSeekTable(zs); + if (ZSTD_isError(seekTableInit)) return seekTableInit; } + + zs->decompressedOffset = (U64)-1; + zs->curFrame = (U32)-1; + + { const size_t dstreamInit = ZSTD_initDStream(zs->dstream); + if (ZSTD_isError(dstreamInit)) return dstreamInit; } + return 0; +} + + + +// this functions returns the over-all decompressed size +// this data might be accessible pre-compiled through the zstd library too? 
+size_t ZSTD_seekable_getFileDecompressedSize(ZSTD_seekable* zs) +{ + size_t written = 0; + + for(size_t i = 0 ; i < zs->seekTable.tableLen ; i++) { + written += ZSTD_seekable_getFrameDecompressedSize(zs, i); + } + + return written; +} + + + +size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long long offset) +{ + U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset); + do { + /* check if we can continue from a previous decompress job */ + if (targetFrame != zs->curFrame || offset != zs->decompressedOffset) { + zs->decompressedOffset = zs->seekTable.entries[targetFrame].dOffset; + zs->curFrame = targetFrame; + + CHECK_IO(zs->src.seek(zs->src.opaque, + zs->seekTable.entries[targetFrame].cOffset, + SEEK_SET)); + zs->in = (ZSTD_inBuffer){zs->inBuff, 0, 0}; + XXH64_reset(&zs->xxhState, 0); + ZSTD_resetDStream(zs->dstream); + } + + while (zs->decompressedOffset < offset + len) { + size_t toRead; + ZSTD_outBuffer outTmp; + size_t prevOutPos; + if (zs->decompressedOffset < offset) { + /* dummy decompressions until we get to the target offset */ + outTmp = (ZSTD_outBuffer){zs->outBuff, MIN(SEEKABLE_BUFF_SIZE, offset - zs->decompressedOffset), 0}; + } else { + outTmp = (ZSTD_outBuffer){dst, len, zs->decompressedOffset - offset}; + } + + prevOutPos = outTmp.pos; + toRead = ZSTD_decompressStream(zs->dstream, &outTmp, &zs->in); + if (ZSTD_isError(toRead)) { + return toRead; + } + + if (zs->seekTable.checksumFlag) { + XXH64_update(&zs->xxhState, (BYTE*)outTmp.dst + prevOutPos, + outTmp.pos - prevOutPos); + } + zs->decompressedOffset += outTmp.pos - prevOutPos; + + if (toRead == 0) { + /* frame complete */ + + /* verify checksum */ + if (zs->seekTable.checksumFlag && + (XXH64_digest(&zs->xxhState) & 0xFFFFFFFFU) != + zs->seekTable.entries[targetFrame].checksum) { + return ERROR(corruption_detected); + } + + if (zs->decompressedOffset < offset + len) { + /* go back to the start and force a reset of the stream */ + targetFrame = 
ZSTD_seekable_offsetToFrameIndex(zs, zs->decompressedOffset); + } + break; + } + + /* read in more data if we're done with this buffer */ + if (zs->in.pos == zs->in.size) { + toRead = MIN(toRead, SEEKABLE_BUFF_SIZE); + CHECK_IO(zs->src.read(zs->src.opaque, zs->inBuff, toRead)); + zs->in.size = toRead; + zs->in.pos = 0; + } + } + } while (zs->decompressedOffset != offset + len); + + return len; +} + +size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex) +{ + if (frameIndex >= zs->seekTable.tableLen) { + return ERROR(frameIndex_tooLarge); + } + + { + size_t const decompressedSize = + zs->seekTable.entries[frameIndex + 1].dOffset - + zs->seekTable.entries[frameIndex].dOffset; + if (dstSize < decompressedSize) { + return ERROR(dstSize_tooSmall); + } + return ZSTD_seekable_decompress( + zs, dst, decompressedSize, + zs->seekTable.entries[frameIndex].dOffset); + } +} diff --git a/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp b/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp new file mode 100644 index 00000000..750bd775 --- /dev/null +++ b/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + + + +#include // malloc, exit +#include // fprintf, perror, feof +#include // strerror +#include // errno +//#define ZSTD_STATIC_LINKING_ONLY +#include // presumes zstd library is installed +#include +#include + +#include // random nr + + +#include "zstd_seekable.h" +#include "zstd_seekable_utils.hpp" + + +//#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + + +void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc:"); + exit(1); +} + + +FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + + +size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + + +size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + + +size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + + + +void* realloc_orDie(void* ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr) return ptr; + /* error */ + perror("realloc"); + exit(1); +} + + + +void fseek_orDie(FILE* file, long int offset, int origin) { + if (!fseek(file, offset, origin)) { + if (!fflush(file)) return; + } + /* error */ + perror("fseek"); + exit(7); +} + + + + +size_t ZSTD_seekable_compressFile_orDie(const char* fname, + const char* outName, + int cLevel, + unsigned int frameSize) +{ + size_t written = 0; + + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = fopen_orDie(outName, "wb"); + size_t const buffInSize = ZSTD_CStreamInSize(); /* can always read one full block */ + void* const buffIn = malloc_orDie(buffInSize); + size_t const buffOutSize = ZSTD_CStreamOutSize(); /* can always flush a full block */ + void* const buffOut = malloc_orDie(buffOutSize); + + ZSTD_seekable_CStream* const cstream = ZSTD_seekable_createCStream(); + if 
(cstream==NULL) { fprintf(stderr, "ZSTD_seekable_createCStream() error \n"); exit(10); } + size_t const initResult = ZSTD_seekable_initCStream(cstream, cLevel, 1, frameSize); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + size_t read, toRead = buffInSize; + while( (read = fread_orDie(buffIn, toRead, fin)) ) { + ZSTD_inBuffer input = { buffIn, read, 0 }; + while (input.pos < input.size) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + toRead = ZSTD_seekable_compressStream(cstream, &output , &input); /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */ + if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_seekable_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); } + if (toRead > buffInSize) toRead = buffInSize; /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/ + fwrite_orDie(buffOut, output.pos, fout); + } + } + + while (1) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + size_t const remainingToFlush = ZSTD_seekable_endStream(cstream, &output); /* close stream */ + if (ZSTD_isError(remainingToFlush)) { fprintf(stderr, "ZSTD_seekable_endStream() error : %s \n", ZSTD_getErrorName(remainingToFlush)); exit(13); } + written += fwrite_orDie(buffOut, output.pos, fout); + if (!remainingToFlush) break; + } + + ZSTD_seekable_freeCStream(cstream); + fclose_orDie(fout); + fclose_orDie(fin); + free(buffIn); + free(buffOut); + + return written; +} + + + + + +ZSTD_seekable_decompress_init_data* ZSTD_seekable_decompressFile_init(const char* fname) +{ + //FILE* const fin = fopen_orDie(fname, "rb"); + + return new ZSTD_seekable_decompress_init_data{ + fopen_orDie(fname, "rb") + }; +} + + + + + + + +size_t ZSTD_seekable_decompressFile_orDie(ZSTD_seekable_decompress_init_data* fh, off_t startOffset, char* buffer, off_t endOffset) +{ + //printf("[%i] Read raw zstd seekable [re-using fin] ... 
\n", msgid); + size_t written = 0; + + if(fh->fin == NULL) { + printf("fin == NULL: YES!!\n"); + exit(124); + } + //else { + // printf("[%i] == NULL: no\n",msgid); + //} + + if (feof(fh->fin)) { + printf ("!!!! FEOF !!!!! \n"); + exit(123); + } + //else { + // printf ("[%i] no feof\n",msgid); + //} + //printf("[%i] ftell: %i\n",msgid, ftell(fh->fin)); + fseek_orDie(fh->fin,0, SEEK_SET); + //printf("[%i] ftell post fseek: %i\n",msgid, ftell(fh->fin)); + + + size_t const buffOutSize = ZSTD_DStreamOutSize(); // Guarantee to successfully flush at least one complete compressed block in all circumstances. + char* const buffOut = (char*) malloc_orDie(buffOutSize); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + size_t const initResult = ZSTD_seekable_initFile(seekable, fh->fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + size_t maxFileSize = ZSTD_seekable_getFileDecompressedSize(seekable); + endOffset = std::min( (size_t) endOffset, maxFileSize); // avoid out of boundary requests + + size_t buffer_out_i = 0; + while (startOffset < endOffset) { + size_t const result = ZSTD_seekable_decompress(seekable, buffOut, std::min((size_t) endOffset - startOffset, buffOutSize), (size_t) startOffset); + + if (ZSTD_isError(result)) { + fprintf(stderr, "ZSTD_seekable_decompress() error : %s \n", + ZSTD_getErrorName(result)); + exit(12); + } + + for(size_t i = 0; i < result; i++) { + buffer[buffer_out_i] = buffOut[i]; + buffer_out_i++; + } + + startOffset += result; + written += result; + } + + ZSTD_seekable_free(seekable); + //fclose_orDie(fin); + free(buffOut); + + + //fh->fin_locked = false; + + return written; +} + + + + + + +size_t ZSTD_seekable_decompressFile_orDie(const char* fname, off_t startOffset, char* buffer, off_t endOffset) +{ + printf("Read raw zstd seekable [+ new fin] ... 
\n"); + size_t written = 0; + + FILE* const fin = fopen_orDie(fname, "rb"); + size_t const buffOutSize = ZSTD_DStreamOutSize(); // Guarantee to successfully flush at least one complete compressed block in all circumstances. + char* const buffOut = (char*) malloc_orDie(buffOutSize); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + size_t const initResult = ZSTD_seekable_initFile(seekable, fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + size_t maxFileSize = ZSTD_seekable_getFileDecompressedSize(seekable); + endOffset = std::min( (size_t) endOffset, maxFileSize); // avoid out of boundary requests + + size_t buffer_out_i = 0; + while (startOffset < endOffset) { + size_t const result = ZSTD_seekable_decompress(seekable, buffOut, std::min((size_t) endOffset - startOffset, buffOutSize), (size_t) startOffset); + + if (ZSTD_isError(result)) { + fprintf(stderr, "ZSTD_seekable_decompress() error : %s \n", + ZSTD_getErrorName(result)); + exit(12); + } + + for(size_t i = 0; i < result; i++) { + buffer[buffer_out_i] = buffOut[i]; + buffer_out_i++; + } + + startOffset += result; + written += result; + } + + ZSTD_seekable_free(seekable); + fclose_orDie(fin); + free(buffOut); + + return written; +} + + + diff --git a/deps b/deps new file mode 100644 index 00000000..e0044eea --- /dev/null +++ b/deps @@ -0,0 +1,12 @@ + +Required dependencies: + +cmake +libboost (libboost-dev + libboost-test-dev + libboost-system-dev + libboost-filesystem-dev) +libssl (libssl-dev) +libzstd (libzstd-dev) +libzstd (libzlib-dev) +libz / zlib (zlib1g-dev) +libfuse (fuse) [tested with fuse2] +libfuse-dev (fuse) [tested with fuse2] +c & cpp (build-essential) diff --git a/include/chunked_reader.hpp b/include/chunked_reader.hpp new file mode 100644 index 00000000..7079dc78 --- /dev/null +++ 
b/include/chunked_reader.hpp @@ -0,0 +1,70 @@ + + +#ifndef CHUNKED_READER +#define CHUNKED_READER + + +#include +#include +#include +#include + +#include + + +#include "config.hpp" + +#include "utils.hpp" + +#include "zstd_seekable_utils.hpp" + + +enum compression_type : signed char { + undefined = -1, + uncompressed = 0, + zstd = 1 +}; + + + + +class chunked_reader +{ +public: + chunked_reader(char *); // filename + chunked_reader(const char *); // filename + ~chunked_reader(); + + void init(); // generic tasks needed for init + + std::string filename; // try doing this with inode + + std::ifstream *fh_flat; + void update_flat_buffer(); + + ZSTD_seekable_decompress_init_data* fh_zstd; + void update_zstd_buffer(); + + + compression_type filetype; + char buffer[READ_BUFFER_SIZE + 1]; + size_t buffer_i; + size_t buffer_n; + + off_t file_i; + + void set_filetype(); + + size_t read(char *, size_t);// @deprecate + size_t read(unsigned char *, size_t); + unsigned char read(); + + void seek(off_t); + size_t tell(); + //size_t size(); +}; + + + +#endif + diff --git a/include/config.hpp.in b/include/config.hpp.in index 91f3c9d5..6f80a01a 100644 --- a/include/config.hpp.in +++ b/include/config.hpp.in @@ -58,6 +58,9 @@ static const std::string UCSC2BIT_VERSION = "\x00\x00\x00\x00"s; static const std::string FASTAFS_MAGIC = "\x0F\x0A\x46\x53"s; static const std::string FASTAFS_VERSION = "\x00\x00\x00\x00"s; +static const std::string ZSTD_MAGIC = "\x28\xB5\x2F\xFD"s; + + static const std::string DICT_HEADER = "@HD\tVN:1.0\tSO:unsorted\n"; @@ -70,10 +73,11 @@ static const size_t MAX_SIZE_SEQ_NAME = 255; const static char DICT_TWOBIT = 1; const static char DICT_FOURBIT = 2; +const static char DICT_FIVEBIT = 3; -static const char ENCODE_HASH_TWOBIT_DNA[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", 
"TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; -static const char ENCODE_HASH_TWOBIT_RNA[256][5] = {"UUUU", "UUUC", "UUUA", "UUUG", "UUCU", "UUCC", "UUCA", "UUCG", "UUAU", "UUAC", "UUAA", "UUAG", "UUGU", "UUGC", 
"UUGA", "UUGG", "UCUU", "UCUC", "UCUA", "UCUG", "UCCU", "UCCC", "UCCA", "UCCG", "UCAU", "UCAC", "UCAA", "UCAG", "UCGU", "UCGC", "UCGA", "UCGG", "UAUU", "UAUC", "UAUA", "UAUG", "UACU", "UACC", "UACA", "UACG", "UAAU", "UAAC", "UAAA", "UAAG", "UAGU", "UAGC", "UAGA", "UAGG", "UGUU", "UGUC", "UGUA", "UGUG", "UGCU", "UGCC", "UGCA", "UGCG", "UGAU", "UGAC", "UGAA", "UGAG", "UGGU", "UGGC", "UGGA", "UGGG", "CUUU", "CUUC", "CUUA", "CUUG", "CUCU", "CUCC", "CUCA", "CUCG", "CUAU", "CUAC", "CUAA", "CUAG", "CUGU", "CUGC", "CUGA", "CUGG", "CCUU", "CCUC", "CCUA", "CCUG", "CCCU", "CCCC", "CCCA", "CCCG", "CCAU", "CCAC", "CCAA", "CCAG", "CCGU", "CCGC", "CCGA", "CCGG", "CAUU", "CAUC", "CAUA", "CAUG", "CACU", "CACC", "CACA", "CACG", "CAAU", "CAAC", "CAAA", "CAAG", "CAGU", "CAGC", "CAGA", "CAGG", "CGUU", "CGUC", "CGUA", "CGUG", "CGCU", "CGCC", "CGCA", "CGCG", "CGAU", "CGAC", "CGAA", "CGAG", "CGGU", "CGGC", "CGGA", "CGGG", "AUUU", "AUUC", "AUUA", "AUUG", "AUCU", "AUCC", "AUCA", "AUCG", "AUAU", "AUAC", "AUAA", "AUAG", "AUGU", "AUGC", "AUGA", "AUGG", "ACUU", "ACUC", "ACUA", "ACUG", "ACCU", "ACCC", "ACCA", "ACCG", "ACAU", "ACAC", "ACAA", "ACAG", "ACGU", "ACGC", "ACGA", "ACGG", "AAUU", "AAUC", "AAUA", "AAUG", "AACU", "AACC", "AACA", "AACG", "AAAU", "AAAC", "AAAA", "AAAG", "AAGU", "AAGC", "AAGA", "AAGG", "AGUU", "AGUC", "AGUA", "AGUG", "AGCU", "AGCC", "AGCA", "AGCG", "AGAU", "AGAC", "AGAA", "AGAG", "AGGU", "AGGC", "AGGA", "AGGG", "GUUU", "GUUC", "GUUA", "GUUG", "GUCU", "GUCC", "GUCA", "GUCG", "GUAU", "GUAC", "GUAA", "GUAG", "GUGU", "GUGC", "GUGA", "GUGG", "GCUU", "GCUC", "GCUA", "GCUG", "GCCU", "GCCC", "GCCA", "GCCG", "GCAU", "GCAC", "GCAA", "GCAG", "GCGU", "GCGC", "GCGA", "GCGG", "GAUU", "GAUC", "GAUA", "GAUG", "GACU", "GACC", "GACA", "GACG", "GAAU", "GAAC", "GAAA", "GAAG", "GAGU", "GAGC", "GAGA", "GAGG", "GGUU", "GGUC", "GGUA", "GGUG", "GGCU", "GGCC", "GGCA", "GGCG", "GGAU", "GGAC", "GGAA", "GGAG", "GGGU", "GGGC", "GGGA", "GGGG"}; +static const int ZSTD_COMPRESSION_QUALIITY = 5; +static const 
unsigned ZSTD_SEEKABLE_FRAME_SIZE = 1024 * 1024; // size in bytes [1mb] #endif diff --git a/include/fasta_to_fastafs.hpp b/include/fasta_to_fastafs.hpp index 2b629e68..c0e267da 100644 --- a/include/fasta_to_fastafs.hpp +++ b/include/fasta_to_fastafs.hpp @@ -9,6 +9,8 @@ #include "fastafs.hpp" #include "twobit_byte.hpp" #include "fourbit_byte.hpp" +#include "fivebit_fivebytes.hpp" + @@ -16,13 +18,18 @@ class fasta_to_fastafs_seq { public: - void add_N(); + void add_unknown(); + + void finish_sequence(std::ofstream &); - void add_twobit_ACTG(unsigned char, std::ofstream &);//Adds a T or a U - void finish_twobit_sequence(std::ofstream &); + void twobit_add(unsigned char, std::ofstream &); + void twobit_finish_sequence(std::ofstream &); - void add_fourbit_ACTG(unsigned char, std::ofstream &);//Adds a T or a U - void finish_fourbit_sequence(std::ofstream &); + void fourbit_add(unsigned char, std::ofstream &); + void fourbit_finish_sequence(std::ofstream &); + + void fivebit_add(unsigned char, std::ofstream &); + void fivebit_finish_sequence(std::ofstream &); off_t file_offset_in_fasta; // file positions in FASTA file where sequence data blocks starts [ACTG] off_t file_offset_in_fastafs; // file positions in FASTAFS file where sequence data blocks starts [2bit/4bit] @@ -62,6 +69,7 @@ class fasta_to_fastafs_seq twobit_byte twobit_data; fourbit_byte fourbit_data; + fivebit_fivebytes fivebit_data; fasta_to_fastafs_seq(off_t fof_fasta, off_t fof_fastafs, const std::string &name): @@ -79,11 +87,11 @@ class fasta_to_fastafs_seq { if(name.size() > 255) { fprintf(stderr, "[fasta_to_fastafs::init] sequence name truncated to 255 charaters: %s\n", name.c_str()); - this->name = this->name.substr (0,255); + this->name = this->name.substr(0, 255); } MD5_Init(&this->ctx); } - + void flush(); }; diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 18922096..8a75933f 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -14,6 +14,7 @@ #include "sequence_region.hpp" 
#include "flags.hpp" +#include "chunked_reader.hpp" struct ffs2f_init_seq { @@ -73,15 +74,15 @@ class fastafs_seq fastafs_seq(); uint32_t fasta_filesize(uint32_t padding); - void view_fasta(ffs2f_init_seq*, std::ifstream *); + void view_fasta(ffs2f_init_seq*, chunked_reader &fh); size_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *); - uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); - uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); - template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, chunked_reader &); + uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, chunked_reader &); + template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, chunked_reader &); - std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. - std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. + std::string sha1(ffs2f_init_seq*, chunked_reader &);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. + std::string md5(ffs2f_init_seq*, chunked_reader &);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. 
uint32_t n_bits(); @@ -123,6 +124,7 @@ class fastafs size_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer + uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t, chunked_reader &); uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); diff --git a/include/fivebit_fivebytes.hpp b/include/fivebit_fivebytes.hpp new file mode 100644 index 00000000..7240f6a8 --- /dev/null +++ b/include/fivebit_fivebytes.hpp @@ -0,0 +1,49 @@ + +#ifndef FIVEBIT_FIVEBYTES_HPP +#define FIVEBIT_FIVEBYTES_HPP + +#include +#include "config.hpp" + +#include "chunked_reader.hpp" + + +class fivebit_fivebytes +{ +public: + static const char fivebit_alphabet[28 + 1]; + static const char encode_hash[28 + 1][2]; + + static const char n_fill_unmasked = '?'; + static const char n_fill_masked = '?'; + + unsigned char data_compressed[5]; // 5 + unsigned char data_decompressed[8]; + + void unpack(); // unpacks this->data into this->data_decompressed + + //unsigned char data; // single char - not useful + + //static const unsigned char bits_per_nucleotide = 5; + //static const char nucleotides_per_byte = 1 ; //8 / bits_per_nucleotide ; + static const char bytes_per_chunk = 5 ; // this is about decompressed chunks + static const char nucleotides_per_chunk = 8 ; // this is about decompressed chunks + + void set(unsigned char, unsigned char); + void set(char *);// string with 8 amino acids + void set_compressed(unsigned char (&compressed_data)[5]);// string with 5 character - requires unpacking + + char *get(void); + char *get(unsigned char); + + static unsigned char iterator_to_offset(unsigned int); + static unsigned char decompressed_to_compressed_bytes(unsigned char); // when only 5/8 bytes are 
filled, only 4/5 bytes need to be written + + static const off_t nucleotides_to_compressed_fileoffset(size_t); // file offset waarna gelezen kan worden + static const off_t nucleotides_to_compressed_offset(size_t);// aantal bytes nodig om zoveel data weg te schrijven + + + void next(chunked_reader &); // update the compressed data and set buffer to decompressed data +}; + +#endif diff --git a/include/flags.hpp b/include/flags.hpp index cdf837d4..d6874d87 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -80,6 +80,7 @@ class fastafs_sequence_flags : public twobit_flag bool is_dna(); // alphabet: 'ACTG' + 'N' bool is_rna(); // alphabet: 'ACUG' + 'N' bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' + bool is_protein(); // alphabet: 'ABCDEFGHIJKLMNOPQRSTUVWYZX*-' bool is_complete(); bool is_incomplete() @@ -107,6 +108,7 @@ class fastafs_sequence_flags : public twobit_flag void set_dna(); void set_rna(); void set_iupec_nucleotide(); + void set_protein(); void set_complete(); void set_incomplete(); diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp index 89e46957..f6decccf 100644 --- a/include/fourbit_byte.hpp +++ b/include/fourbit_byte.hpp @@ -1,28 +1,37 @@ -#ifndef fourbit_BYTE_HPP -#define fourbit_BYTE_HPP +#ifndef FOURBIT_BYTE_HPP +#define FOURBIT_BYTE_HPP #include #include "config.hpp" +#include "chunked_reader.hpp" + + class fourbit_byte { public: static const char fourbit_alhpabet[17]; - static const char encode_hash[256][3]; + static char encode_hash[256][3]; static const char n_fill_unmasked = '-'; static const char n_fill_masked = '-'; - static const char bits_per_nucleotide = 4; - static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; + static const unsigned char bits_per_nucleotide = 4; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; // this is about compressed data + static const char nucleotides_per_chunk = 8 / bits_per_nucleotide ; // this is about decompressed chunks unsigned char 
data; void set(unsigned char, unsigned char); void set(char*);// string met 4 bytes set - const char *get(void); + char *get(void); char *get(unsigned char); static unsigned char iterator_to_offset(unsigned int); + + static const off_t nucleotides_to_compressed_fileoffset(size_t); // file offset waarna gelezen kan worden + static const off_t nucleotides_to_compressed_offset(size_t);// aantal bytes nodig om zoveel data weg te schrijven + + void next(chunked_reader &); // update the compressed data and set buffer to decompressed data }; #endif diff --git a/include/twobit_byte.hpp b/include/twobit_byte.hpp index a36615dd..798bee8c 100644 --- a/include/twobit_byte.hpp +++ b/include/twobit_byte.hpp @@ -5,30 +5,46 @@ #include #include "config.hpp" +#include "chunked_reader.hpp" + + class twobit_byte { -public: - const char (&encode_hash)[256][5]; - twobit_byte(const char (&encode_hash_arg)[256][5]): encode_hash(encode_hash_arg) {}; +private: // things only needed by the compression [encoding, not decoding] +public: + char (&encode_hash)[256][5]; + twobit_byte(char (&encode_hash_arg)[256][5]): encode_hash(encode_hash_arg) {}; static const char n_fill_unmasked = 'N'; static const char n_fill_masked = 'n'; - static const char bits_per_nucleotide = 2; - static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; + static const unsigned char bits_per_nucleotide = 2; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; // this is about compressed data + static const char nucleotides_per_chunk = 8 / bits_per_nucleotide ; // this is about decompressed chunks - unsigned char data; + unsigned char data; // go private void set(unsigned char, unsigned char); void set(char*);// string met 4 bytes set - const char *get(void); + char *get(void); char *get(unsigned char); static unsigned char iterator_to_offset(unsigned int); + + static const off_t nucleotides_to_compressed_fileoffset(size_t); // file offset waarna gelezen kan worden + static const off_t 
nucleotides_to_compressed_offset(size_t);// aantal bytes nodig om zoveel data weg te schrijven + + void next(chunked_reader &); // update the compressed data }; + +static char ENCODE_HASH_TWOBIT_DNA[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", 
"GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; +static char ENCODE_HASH_TWOBIT_RNA[256][5] = {"UUUU", "UUUC", "UUUA", "UUUG", "UUCU", "UUCC", "UUCA", "UUCG", "UUAU", "UUAC", "UUAA", "UUAG", "UUGU", "UUGC", "UUGA", "UUGG", "UCUU", "UCUC", "UCUA", "UCUG", "UCCU", "UCCC", "UCCA", "UCCG", "UCAU", "UCAC", "UCAA", "UCAG", "UCGU", "UCGC", "UCGA", "UCGG", "UAUU", "UAUC", "UAUA", "UAUG", "UACU", "UACC", "UACA", "UACG", "UAAU", "UAAC", "UAAA", "UAAG", "UAGU", "UAGC", "UAGA", "UAGG", "UGUU", "UGUC", "UGUA", "UGUG", "UGCU", "UGCC", "UGCA", "UGCG", "UGAU", "UGAC", "UGAA", "UGAG", "UGGU", "UGGC", "UGGA", "UGGG", "CUUU", "CUUC", "CUUA", "CUUG", "CUCU", "CUCC", "CUCA", "CUCG", "CUAU", "CUAC", "CUAA", "CUAG", "CUGU", "CUGC", "CUGA", "CUGG", "CCUU", "CCUC", "CCUA", "CCUG", "CCCU", "CCCC", "CCCA", "CCCG", "CCAU", "CCAC", "CCAA", "CCAG", "CCGU", "CCGC", "CCGA", "CCGG", "CAUU", "CAUC", "CAUA", "CAUG", "CACU", "CACC", "CACA", "CACG", "CAAU", "CAAC", "CAAA", "CAAG", "CAGU", "CAGC", "CAGA", "CAGG", "CGUU", "CGUC", "CGUA", "CGUG", "CGCU", "CGCC", "CGCA", "CGCG", "CGAU", "CGAC", "CGAA", "CGAG", "CGGU", "CGGC", "CGGA", "CGGG", "AUUU", "AUUC", "AUUA", "AUUG", "AUCU", "AUCC", "AUCA", "AUCG", "AUAU", "AUAC", "AUAA", "AUAG", "AUGU", "AUGC", "AUGA", "AUGG", "ACUU", "ACUC", "ACUA", "ACUG", "ACCU", "ACCC", "ACCA", "ACCG", "ACAU", "ACAC", "ACAA", "ACAG", "ACGU", "ACGC", "ACGA", "ACGG", "AAUU", "AAUC", "AAUA", "AAUG", "AACU", "AACC", "AACA", "AACG", "AAAU", "AAAC", "AAAA", "AAAG", "AAGU", "AAGC", "AAGA", "AAGG", "AGUU", "AGUC", "AGUA", "AGUG", "AGCU", "AGCC", "AGCA", "AGCG", "AGAU", "AGAC", "AGAA", "AGAG", "AGGU", "AGGC", "AGGA", "AGGG", "GUUU", "GUUC", "GUUA", "GUUG", "GUCU", "GUCC", "GUCA", "GUCG", "GUAU", "GUAC", "GUAA", "GUAG", "GUGU", "GUGC", "GUGA", "GUGG", "GCUU", "GCUC", "GCUA", 
"GCUG", "GCCU", "GCCC", "GCCA", "GCCG", "GCAU", "GCAC", "GCAA", "GCAG", "GCGU", "GCGC", "GCGA", "GCGG", "GAUU", "GAUC", "GAUA", "GAUG", "GACU", "GACC", "GACA", "GACG", "GAAU", "GAAC", "GAAA", "GAAG", "GAGU", "GAGC", "GAGA", "GAGG", "GGUU", "GGUC", "GGUA", "GGUG", "GGCU", "GGCC", "GGCA", "GGCG", "GGAU", "GGAC", "GGAA", "GGAG", "GGGU", "GGGC", "GGGA", "GGGG"}; + + + class twobit_byte_dna : public twobit_byte { public: diff --git a/include/utils.hpp b/include/utils.hpp index d13194f4..e5751f6f 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -19,9 +19,13 @@ std::string std_string_nullbyte_safe(char *, size_t); bool is_fasta_file(char *); bool is_ucsc2bit_file(char *); +bool is_zstd_file(const char *); std::string basename_cpp(std::string); std::string realpath_cpp(std::string); -uint32_t file_crc32(const std::string &, off_t, size_t ); +uint32_t file_crc32(const std::string &, off_t, size_t); + +bool file_exist(const char *); + diff --git a/scripts/benchmark.py b/scripts/benchmark.py new file mode 100755 index 00000000..a22190d4 --- /dev/null +++ b/scripts/benchmark.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python + +import os +import datetime + +from utils import * +from test_utils import * + + +RESULTS_FILE = "benchmarks/" + get_sys_id() + ".txt" +if not os.path.exists(RESULTS_FILE): + with open(RESULTS_FILE,'w') as fh: + fh.write( + "\t".join( + ["timestamp", "git-commit", "perf:cycles", "perf:total_time", "perf:user_time", "perf:sys_time", "cmd", "git-mod-status"] + ) + "\n" + ) + +GIT_REV = get_git_revision() +TIMESTAMP = str(datetime.datetime.now()) + + +PATH = 'tmp/benchmark' +if not os.path.exists(PATH): + os.mkdir(PATH) + + +generate_ACTG_fa(PATH + "/test.fa") +# time to convert to fastafs +# perf stat -r 4 ./bin/fastafs cache --fastafs-only -o tmp/benchmark/test.fastafs tmp/benchmark/test.fa + +# should find diff + ccyles +difference = diff_fasta_with_mounted(PATH + "/test.fa", "tmp/benchmark/test", "test", 40, './bin/fastafs', 'tmp/benchmark/mnt') +""" 
+./bin/fastafs cache -o tmp/benchmark/test tmp/benchmark/test.fa +./bin/fastafs check -f tmp/benchmark/test.zst + + +./bin/fastafs mount -f -p 40 tmp/benchmark/test.zst tmp/benchmark/mnt/ + +./bin/fastafs mount -p 40 tmp/benchmark/test.zst tmp/benchmark/mnt/ +cat tmp/benchmark/mnt/test.fa > tmp/test.fa.x +fusermount -u tmp/benchmark/mnt/ + + +ll tmp/benchmark/test.fa tmp/test.fa.x + +""" + +# {'cmd': ['perf', 'stat', '-e', 'cycles', './bin/fastafs', 'mount', '-d', '-f', '-p', '40', '-f', 'tmp/benchmark/test.zst', 'tmp/benchmark/mnt/'], +# 'stdout': "\n---\nprocessing argv[0] = './bin/fastafs' [current argument=0]\nprocessing argv[1] = 'mount' ", +# 'stderr': '\n---\nFUSE library version: 2.9.9\nnullpath_ok: 0\nnopath: 0\nutime_omit_ok: 0\nunique: 1, opcode: INIT (', +# 'perf': {'cycles': 213834032495, +# 'total_time': 45.750253598, 'user_time': 76.734195, 'sys_time': 0.899997}} + + + +with open(RESULTS_FILE, 'a') as fh: + #print(" >> difference: " + difference) + fh.write( + "\t".join([TIMESTAMP, + GIT_REV[0].replace('git-commit:',''), + str(difference['perf']['cycles']), + str(difference['perf']['total_time']), + str(difference['perf']['user_time']), + str(difference['perf']['sys_time']), + " ".join(difference['cmd']), + GIT_REV[1]]) + "\n" + ) + + + +# time to convert to fastafs+GZ +# perf stat -r 4 ./bin/fastafs cache -o tmp/benchmark/test.fastafs.gz tmp/benchmark/test.fa + +## exporting to ZSTD is faster than regular writing? 
+ + + +# thread 1: +# perf stat ./bin/fastafs mount -f tmp/benchmark/test.fastafs.gz.zst tmp/benchmark/mnt +# ./bin/fastafs mount -f tmp/benchmark/test.fastafs.gz.zst tmp/benchmark/mnt + +# thread 2: +# cat tmp/benchmark/mnt/test.fastafs.gz.fa > /dev/null ; sudo umount tmp/benchmark/mnt +# + + diff --git a/scripts/test_utils.py b/scripts/test_utils.py new file mode 120000 index 00000000..f991a20e --- /dev/null +++ b/scripts/test_utils.py @@ -0,0 +1 @@ +../test/test_utils.py \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 00000000..bfef5af1 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +import wget +import os +from tqdm import tqdm +import datetime +import subprocess + + + +def generate_ACTG_fa(output_file): + if not os.path.exists(output_file): + with open(output_file, 'w') as fh: + for chrom in tqdm(['chr1', 'chr2', 'chr3', 'chr4']): + fh.write(">" + chrom + "\n") + buf = '' + for i in tqdm(range(1337 * 1337)): + buf += "AAAAACCCCCTTTTTGGGGGGTCAGCTAGCTACGATCGATCGACTACGA" + + chunksize = 40 + while len(buf) >= chunksize: + fh.write(buf[0:chunksize] + "\n") + buf = buf[chunksize:] + + if len(buf) > 0: + fh.write(buf + "\n") + + + +def get_sys_id(): + import socket + sid = socket.gethostname() + + with open('/etc/machine-id') as fh: + mid = fh.read().strip() + + return (sid + "_" + mid) + + import sys + exit(1) + + + + diff --git a/src/chunked_reader.cpp b/src/chunked_reader.cpp new file mode 100644 index 00000000..c005bc08 --- /dev/null +++ b/src/chunked_reader.cpp @@ -0,0 +1,240 @@ + +#include "chunked_reader.hpp" + + + +chunked_reader::chunked_reader(char * afilename) : + fh_flat(nullptr), fh_zstd(nullptr), buffer_i(0), buffer_n(0), file_i(0) +{ + + this->filename = realpath_cpp(afilename); + this->init(); +} + +chunked_reader::chunked_reader(const char * afilename) : + fh_flat(nullptr), fh_zstd(nullptr), buffer_i(0), buffer_n(0), file_i(0) +{ + this->filename = 
realpath_cpp(afilename); + this->init(); +} + +chunked_reader::~chunked_reader() +{ + //printf("[chunked_reader::~chunked_reader] exterminate, destroy(!)\n"); + + if(this->fh_flat != nullptr) { + if(this->fh_flat->is_open()) { + this->fh_flat->close(); + } + + delete this->fh_flat; + } + + if(this->fh_zstd != nullptr) { + //ZSTD_seekable_free(this->fh_zstd->seekable); + fclose_orDie(this->fh_zstd->fin); + + //delete this->fh_zstd->seekable; + //delete this->fh_zstd->fin; + + delete this->fh_zstd; + } +} + +void chunked_reader::init() +{ + this->set_filetype(); + + switch(this->filetype) { + + case uncompressed: + this->fh_flat = new std::ifstream; + this->fh_flat->open(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + + if(this->fh_flat->is_open()) { + this->fh_flat->seekg(0, std::ios::beg); + this->update_flat_buffer(); + } else { + throw std::runtime_error("[chunked_reader::init] Cannot open file for reading.\n"); + } + break; + + case zstd: + //printf("[chunked_reader::init()] - init ZSTD_seekable_decompress_init_data* fh_zstd; \n"); + this->fh_zstd = ZSTD_seekable_decompressFile_init(this->filename.c_str()); + // make zstd handle - to be implemented later on + //ZSTD_seekable_decompress_data + break; + + default: + throw std::runtime_error("[chunked_reader::init] Should never happen - but avoids compiler warning.\n"); + break; + } +} + +void chunked_reader::set_filetype() +{ + if(is_zstd_file((const char*) this->filename.c_str())) { + this->filetype = zstd; + } else { + this->filetype = uncompressed; + } +} + + +size_t chunked_reader::read(char *arg_buffer, size_t buffer_size) +{ + buffer_size = std::min(buffer_size, (size_t) READ_BUFFER_SIZE); + size_t written = 0; + + while(this->buffer_i < this->buffer_n and written < buffer_size) { + arg_buffer[written++] = this->buffer[this->buffer_i++]; + } + + /* + size_t n = std::min(this->buffer_n - this->buffer_i, buffer_size - written); + memcpy(&arg_buffer[written], 
&this->buffer[this->buffer_i] , n); + written += n; + this->buffer_i += n; + */ + + if(written < buffer_size) { + // overwrite buffer + switch(this->filetype) { + case uncompressed: + this->update_flat_buffer(); + break; + case zstd: + this->update_zstd_buffer(); + break; + default: + throw std::runtime_error("[chunked_reader::read] reading from uninitialized object\n"); + break; + } + + // same loop again + while(this->buffer_i < this->buffer_n and written < buffer_size) { + arg_buffer[written++] = this->buffer[this->buffer_i++]; + } + /* - somehow memcpy is slightly slower - test again @ mom laptop + size_t n = std::min(this->buffer_n - this->buffer_i, buffer_size - written); + memcpy(&arg_buffer[written], &this->buffer[this->buffer_i] , n); + written += n; + this->buffer_i += n; + */ + } + + return written; +} + + + + + +size_t chunked_reader::read(unsigned char *arg_buffer, size_t buffer_size) +{ + buffer_size = std::min(buffer_size, (size_t) READ_BUFFER_SIZE); + size_t written = 0; + + while(this->buffer_i < this->buffer_n and written < buffer_size) { + arg_buffer[written++] = this->buffer[this->buffer_i++]; + } + + + if(written < buffer_size) { + // overwrite buffer + switch(this->filetype) { + case uncompressed: + this->update_flat_buffer(); + break; + case zstd: + this->update_zstd_buffer(); + break; + default: + throw std::runtime_error("[chunked_reader::read] reading from uninitialized object\n"); + break; + } + + // same loop again + while(this->buffer_i < this->buffer_n and written < buffer_size) { + arg_buffer[written++] = this->buffer[this->buffer_i++]; + } + } + + return written; +} + + + +// reads single byte from the buffer +unsigned char chunked_reader::read() +{ + if(this->buffer_i >= this->buffer_n) { + switch(this->filetype) { + case uncompressed: + this->update_flat_buffer(); + break; + case zstd: + this->update_zstd_buffer(); + break; + default: + throw std::runtime_error("[chunked_reader::read] reading from uninitialized object\n"); + break; 
+ } + } + + return this->buffer[this->buffer_i++]; +} + + + + +void chunked_reader::update_flat_buffer() +{ + this->fh_flat->read(this->buffer, READ_BUFFER_SIZE); + + this->buffer_i = 0; + this->buffer_n = (size_t) this->fh_flat->gcount(); + this->file_i += this->buffer_n; +} + + +void chunked_reader::update_zstd_buffer() +{ + //size_t written = ZSTD_seekable_decompressFile_orDie(this->filename.c_str(), this->file_i, this->buffer, this->file_i + READ_BUFFER_SIZE); + size_t written = ZSTD_seekable_decompressFile_orDie(this->fh_zstd, this->file_i, this->buffer, this->file_i + READ_BUFFER_SIZE); + + this->buffer_i = 0; + this->buffer_n = written; + this->file_i += written; +} + + + +void chunked_reader::seek(off_t offset) +{ + this->file_i = offset; + + switch(this->filetype) { + case uncompressed: + this->fh_flat->clear(); // reset error state + + if(!this->fh_flat->is_open()) { + this->fh_flat->open(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + } + + this->fh_flat->seekg(offset, std::ios::beg); + this->update_flat_buffer(); + break; + default: + this->update_zstd_buffer(); + break; + } +} + + +size_t chunked_reader::tell() +{ + return this->file_i - this->buffer_n + this->buffer_i; +} + diff --git a/src/database.cpp b/src/database.cpp index aff2962f..a7e78eaf 100644 --- a/src/database.cpp +++ b/src/database.cpp @@ -53,46 +53,62 @@ void database::list() std::cout << "FASTAFS NAME\tFASTAFS\t\tSEQUENCES\tBASES\t\tDISK SIZE\tCOMPR-%\tMOUNT POINT(S)" << std::endl; std::ifstream infile(this->idx); std::string line; + std::string version; + while(std::getline(infile, line)) { - //std::istringstream iss(line); std::string fname = this->path + "/" + line + ".fastafs"; + bool zstd_seek = false; + + if(!file_exist(fname.c_str())) { + fname = this->path + "/" + line + ".fastafs.zst"; + zstd_seek = true; + } - fastafs f = fastafs(line); - f.load(fname); + if(file_exist(fname.c_str())) { + fastafs f = fastafs(line); + f.load(fname); - std::ifstream 
file(fname, std::ios::in | std::ios::binary | std::ios::ate); - uint32_t size = (uint32_t) file.tellg(); - file.close(); + std::ifstream file(fname, std::ios::in | std::ios::binary | std::ios::ate); + uint32_t size = (uint32_t) file.tellg(); + file.close(); - std::string mountpoints = "-"; - size_t n_mountpoints = fastafs_fuse_mounts.count(fname); + std::string mountpoints = "-"; + size_t n_mountpoints = fastafs_fuse_mounts.count(fname); - if(n_mountpoints > 0) { - mountpoints == ""; - bool is_first = true; + if(n_mountpoints > 0) { + mountpoints == ""; + bool is_first = true; - auto it = fastafs_fuse_mounts.find(fname); - for(; it != fastafs_fuse_mounts.end() ; it++) { - if(is_first) { - mountpoints = it->second.second; - is_first = false; - } else { - mountpoints = mountpoints + "," + it->second.second; + auto it = fastafs_fuse_mounts.find(fname); + for(; it != fastafs_fuse_mounts.end() ; it++) { + if(is_first) { + mountpoints = it->second.second; + is_first = false; + } else { + mountpoints = mountpoints + "," + it->second.second; + } } } - } + if(zstd_seek) { + version = "v0-x32+Z"; + } else { + version = "v0-x32"; + } - printf("%-16s%-16s%-16u%-16u%-16u%-8.1f%s\n",//double %% escapes the - line.c_str(), - std::string("v0-x32-2bit").c_str(),// version ,architechture (32 bit = max 4Gb files..., but can be elaborated to max 4gb per sequence line, then compression types, currently only 2bit) - (uint32_t) f.data.size(), - f.n(), - size, - (float) 100.0 * (float) size / (float) f.fasta_filesize(50), // @todo fastafs file size! - mountpoints.c_str() - ); + printf("%-16s%-16s%-16u%-16u%-16u%-8.1f%s\n",//double %% escapes the + line.c_str(), + version.c_str(),// version ,architechture (32 bit = max 4Gb files..., but can be elaborated to max 4gb per sequence line, then compression types, currently only 2bit) + (uint32_t) f.data.size(), + f.n(), + size, + (float) 100.0 * (float) size / (float) f.fasta_filesize(50), // @todo fastafs file size! 
+ mountpoints.c_str() + ); + } else { + // print error invalid file? + } } } @@ -101,9 +117,11 @@ void database::list() std::string database::add(char *name) { std::ofstream outputFile; + outputFile.open(this->idx, std::fstream::app); outputFile << name << std::endl; outputFile.close(); + return this->path + "/" + name + ".fastafs"; } @@ -117,12 +135,18 @@ std::string database::get(std::string fastafs_name_or_id) std::string fname; std::ifstream infile(this->idx); std::string line; + while(std::getline(infile, line)) { - //std::istringstream iss(line); if(line.compare(fastafs_name_or_id) == 0) { fname = this->path + "/" + line + ".fastafs"; + + if(!file_exist(fname.c_str())) { + fname = this->path + "/" + line + ".fastafs.zst"; + } + } } + return fname; } diff --git a/src/fasta_to_fastafs.cpp b/src/fasta_to_fastafs.cpp index 83983001..4b67d4b3 100644 --- a/src/fasta_to_fastafs.cpp +++ b/src/fasta_to_fastafs.cpp @@ -10,15 +10,15 @@ -const static char nt[2] = "T"; -const static char nc[2] = "C"; + const static char na[2] = "A"; +const static char nc[2] = "C"; const static char ng[2] = "G"; +const static char nt[2] = "T"; +const static char nu[2] = "U"; const static char nn[2] = "N"; - -const static char nu[2] = "U"; const static char nr[2] = "R"; const static char ny[2] = "Y"; const static char nk[2] = "K"; @@ -30,29 +30,40 @@ const static char nd[2] = "D"; const static char nh[2] = "H"; const static char nv[2] = "V"; +const static char ne[2] = "E"; +const static char nf[2] = "F"; +const static char ni[2] = "I"; +const static char nj[2] = "J"; +const static char nl[2] = "L"; +const static char no[2] = "O"; +const static char np[2] = "P"; +const static char nq[2] = "Q"; +const static char nz[2] = "Z"; +const static char nx[2] = "X"; size_t fasta_to_fastafs_seq::N_bytes_used() { // just the number of n-blocks, not their actual size - return (size_t) (4 + (this->n_block_starts.size() * (4*2))); + return (size_t)(4 + (this->n_block_starts.size() * (4 * 2))); } -size_t 
fasta_to_fastafs_seq::twobit_bytes_used() { - +size_t fasta_to_fastafs_seq::twobit_bytes_used() +{ //printf("n_actg: %i\n", n_actg); //printf("n_actg: %i + 3 = \n", n_actg, n_actg + 3); //printf("n_actg: (%i + 3) / 4 = %i\n", n_actg, (n_actg + 3) / 4); - - static const int twobits_per_byte = 4; - return (size_t) ((this->n_actg + (twobits_per_byte - 1)) / twobits_per_byte); + + //return (size_t)((this->n_actg + (twobit_byte::nucleotides_per_byte - 1)) / twobit_byte::nucleotides_per_byte); + return twobit_byte::nucleotides_to_compressed_offset(this->n_actg); + } -void fasta_to_fastafs_seq::add_twobit_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +void fasta_to_fastafs_seq::twobit_add(unsigned char nucleotide, std::ofstream &fh_fastafs) { this->twobit_data.set(twobit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = @@ -72,18 +83,18 @@ void fasta_to_fastafs_seq::add_twobit_ACTG(unsigned char nucleotide, std::ofstre -void fasta_to_fastafs_seq::finish_twobit_sequence(std::ofstream &fh_fastafs) +void fasta_to_fastafs_seq::finish_sequence(std::ofstream &fh_fastafs) { - uint32_t j; - - // flush last nucleotide - if(this->n_actg % 4 != 0) { - for(j = this->n_actg % 4; j < 4; j++) { - this->twobit_data.set(twobit_byte::iterator_to_offset(j), 0); - } - fh_fastafs << this->twobit_data.data; + if(this->current_dict == DICT_TWOBIT) { + this->twobit_finish_sequence(fh_fastafs); + } else if(this->current_dict == DICT_FOURBIT) { + this->fourbit_finish_sequence(fh_fastafs); + } else { + this->fivebit_finish_sequence(fh_fastafs); } + uint32_t j; + if(this->previous_was_N) { this->n_block_ends.push_back(this->n_actg + this->N - 1); } @@ -141,12 +152,20 @@ void fasta_to_fastafs_seq::finish_twobit_sequence(std::ofstream &fh_fastafs) +void fasta_to_fastafs_seq::twobit_finish_sequence(std::ofstream &fh_fastafs) +{ + // flush last nucleotide + if(this->n_actg % 4 != 0) { + for(uint32_t j = this->n_actg % 4; j < 4; j++) { + 
this->twobit_data.set(twobit_byte::iterator_to_offset(j), 0); + } + fh_fastafs << this->twobit_data.data; + } +} - - -void fasta_to_fastafs_seq::add_fourbit_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +void fasta_to_fastafs_seq::fourbit_add(unsigned char nucleotide, std::ofstream &fh_fastafs) { this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = @@ -163,23 +182,9 @@ void fasta_to_fastafs_seq::add_fourbit_ACTG(unsigned char nucleotide, std::ofstr this->n_actg++; } -void fasta_to_fastafs_seq::add_N() -{ - if(!this->previous_was_N) { - this->n_block_starts.push_back(this->n_actg + this->N); - } - - this->previous_was_N = true; - this->N++; -} - - - -void fasta_to_fastafs_seq::finish_fourbit_sequence(std::ofstream &fh_fastafs) +void fasta_to_fastafs_seq::fourbit_finish_sequence(std::ofstream &fh_fastafs) { - uint32_t j; - // flush last nucleotide if(this->n_actg % 2 != 0) { this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), 0); @@ -187,64 +192,58 @@ void fasta_to_fastafs_seq::finish_fourbit_sequence(std::ofstream &fh_fastafs) fh_fastafs << this->fourbit_data.data; } - if(this->previous_was_N) { - this->n_block_ends.push_back(this->n_actg + this->N - 1); - } +} - // do M block - if(this->in_m_block) { - this->m_block_ends.push_back(this->n_actg + this->N - 1); - //printf("closing m-block: %u\n",this->n_actg + this->N - 1); - } -#if DEBUG - if(this->m_block_starts.size() != this->m_block_ends.size()) { - throw std::runtime_error("M blocks not correctly parsed\n"); +void fasta_to_fastafs_seq::fivebit_add(unsigned char amino_acid, std::ofstream &fh_fastafs) +{ + unsigned char off = fivebit_fivebytes::iterator_to_offset(this->n_actg); + this->fivebit_data.set(off, amino_acid); + + if(this->n_actg % 8 == 7) { + //fh_fastafs << this->fivebit_data.data_compressed; + fh_fastafs.write((const char*) this->fivebit_data.data_compressed, 5); } -#endif //DEBUG - char buffer[4 + 1]; + this->n_actg++; +} - 
// (over)write number nucleotides - std::streamoff index_file_position = fh_fastafs.tellp(); - fh_fastafs.seekp(this->file_offset_in_fastafs, std::ios::beg); - uint_to_fourbytes(buffer, this->n_actg); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - fh_fastafs.seekp(index_file_position, std::ios::beg); +void fasta_to_fastafs_seq::fivebit_finish_sequence(std::ofstream &fh_fastafs) +{ + unsigned char n_sticky = (unsigned char)(this->n_actg % 8); // sticky end + unsigned char n_sticky_compressed = fivebit_fivebytes::decompressed_to_compressed_bytes(n_sticky); - // N blocks - uint_to_fourbytes(buffer, (uint32_t) this->n_block_starts.size()); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - for(j = 0; j < this->n_block_starts.size(); j++) { - uint_to_fourbytes(buffer, this->n_block_starts[j]); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - } - for(j = 0; j < this->n_block_ends.size(); j++) { - uint_to_fourbytes(buffer, this->n_block_ends[j]); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + // flush last nucleotide + if(n_sticky_compressed > 0) { + for(unsigned char i = n_sticky; i < 8 ; i++) { + this->fivebit_data.set(i, 0); + } + + fh_fastafs.write((const char*) this->fivebit_data.data_compressed, n_sticky_compressed); } +} - // write checksum - MD5_Final(this->md5_digest, &this->ctx); - fh_fastafs.write(reinterpret_cast(&this->md5_digest), (size_t) 16); - // M blocks - uint_to_fourbytes(buffer, (uint32_t) this->m_block_starts.size()); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - for(j = 0; j < this->m_block_starts.size(); j++) { - uint_to_fourbytes(buffer, this->m_block_starts[j]); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - } - for(j = 0; j < this->m_block_ends.size(); j++) { - uint_to_fourbytes(buffer, this->m_block_ends[j]); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + +void fasta_to_fastafs_seq::add_unknown() +{ + if(!this->previous_was_N) { + 
this->n_block_starts.push_back(this->n_actg + this->N); } + + this->previous_was_N = true; + this->N++; } -void fasta_to_fastafs_seq::flush() { + + + +void fasta_to_fastafs_seq::flush() +{ this->N = 0; this->n_actg = 0; @@ -255,7 +254,7 @@ void fasta_to_fastafs_seq::flush() { this->m_block_ends.clear(); this->in_m_block = false; this->previous_was_N = false; - + this->has_T = false; this->has_U = false; @@ -309,19 +308,16 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf bool running = getline(fh_fasta, line).good(); while(running) { if(line[0] == '>') { + // more N-bytes than 2-bit bytes - 4bit is more efficient if(auto_recompress_to_fourbit && s->current_dict == DICT_TWOBIT && s->N_bytes_used() > s->twobit_bytes_used()) { fh_fasta.seekg(s->file_offset_in_fasta, std::ios::beg); fh_fastafs.seekp(s->file_offset_in_fastafs + 4, std::ios::beg);// plus four, skipping the size s->flush(); s->current_dict = DICT_FOURBIT; - } - else { - if(s->current_dict == DICT_TWOBIT) { - s->finish_twobit_sequence(fh_fastafs);// finish last sequence - } else { - s->finish_fourbit_sequence(fh_fastafs);// finish last sequence - } + } else { + s->finish_sequence(fh_fastafs); + line.erase(0, 1);// erases first part, quicker would be pointer from first char //s = new fasta_to_fastafs_seq(fh_fastafs.tellp(), line); @@ -331,7 +327,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf index.push_back(s); } } else { - if(s->current_dict == DICT_TWOBIT) { + if(s->current_dict == DICT_TWOBIT) { for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { @@ -347,8 +343,8 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_twobit_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nu, 1);// this needs to be pu in add_Nucleotide + s->twobit_add(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nu, 1);// this needs to be pu in 
add_unknownucleotide break; case 'u':// lower case = m block if(s->has_T) { @@ -361,8 +357,8 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_twobit_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nu, 1);// this needs to be pu in add_Nucleotide + s->twobit_add(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nu, 1);// this needs to be pu in add_unknownucleotide break; case 'T': if(s->has_U) { @@ -375,8 +371,8 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_twobit_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_Nucleotide + s->twobit_add(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_unknownucleotide break; case 't': if(s->has_U) { @@ -389,8 +385,8 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_twobit_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_Nucleotide + s->twobit_add(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_unknownucleotide break; case 'C': if(s->in_m_block) { @@ -398,7 +394,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_twobit_ACTG(NUCLEOTIDE_C, fh_fastafs); + s->twobit_add(NUCLEOTIDE_C, fh_fastafs); MD5_Update(&s->ctx, nc, 1); break; case 'c': @@ -407,7 +403,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_twobit_ACTG(NUCLEOTIDE_C, fh_fastafs); + s->twobit_add(NUCLEOTIDE_C, fh_fastafs); MD5_Update(&s->ctx, nc, 1); break; case 'A': @@ -417,7 +413,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_twobit_ACTG(NUCLEOTIDE_A, fh_fastafs); + s->twobit_add(NUCLEOTIDE_A, 
fh_fastafs); MD5_Update(&s->ctx, na, 1); break; case 'a': @@ -426,7 +422,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_twobit_ACTG(NUCLEOTIDE_A, fh_fastafs); + s->twobit_add(NUCLEOTIDE_A, fh_fastafs); MD5_Update(&s->ctx, na, 1); break; case 'G': @@ -435,7 +431,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_twobit_ACTG(NUCLEOTIDE_G, fh_fastafs); + s->twobit_add(NUCLEOTIDE_G, fh_fastafs); MD5_Update(&s->ctx, ng, 1); break; case 'g': @@ -444,7 +440,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_twobit_ACTG(NUCLEOTIDE_G, fh_fastafs); + s->twobit_add(NUCLEOTIDE_G, fh_fastafs); MD5_Update(&s->ctx, ng, 1); break; case 'N': @@ -453,7 +449,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_N(); + s->add_unknown(); MD5_Update(&s->ctx, nn, 1); break; case 'n': @@ -462,19 +458,42 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_N(); + s->add_unknown(); MD5_Update(&s->ctx, nn, 1); break; - default: + case 'r': + case 'R': + case 'y': + case 'Y': + case 'k': + case 'K': + case 'm': + case 'M': + case 's': + case 'S': + case 'w': + case 'W': + case 'b': + case 'B': + case 'd': + case 'D': + case 'h': + case 'H': + case 'v': + case 'V': + case '-': s->current_dict = DICT_FOURBIT; break; + default: + s->current_dict = DICT_FIVEBIT; + break; } } + if(s->current_dict != DICT_TWOBIT) { + char dict = s->current_dict; // DICT_FOURBIT | DICT_FIVEBIT - //@todo Funct set_to_fourbit(*s, *fasta, *fastafs , ...) 
- if(s->current_dict == DICT_FOURBIT) { // set to fourbit and re-intialize // seek fasta header to beg + s->file_offset_in_fasta // seek fastafs back to beg + s->file_offset_in_fastafs and overwrite @@ -483,10 +502,10 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf fh_fastafs.seekp(s->file_offset_in_fastafs + 4, std::ios::beg);// plus four, skipping the size s->flush(); - s->current_dict = DICT_FOURBIT; + s->current_dict = dict; // set back to dict } - } else { // four bit decoding + } else if(s->current_dict == DICT_FOURBIT) { // four bit decoding for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { @@ -496,7 +515,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(0, fh_fastafs); + s->fourbit_add(0, fh_fastafs); MD5_Update(&s->ctx, na, 1); break; case 'a': @@ -505,7 +524,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(0, fh_fastafs); + s->fourbit_add(0, fh_fastafs); MD5_Update(&s->ctx, na, 1); break; case 'C': @@ -514,7 +533,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(1, fh_fastafs); + s->fourbit_add(1, fh_fastafs); MD5_Update(&s->ctx, nc, 1); break; case 'c': @@ -523,7 +542,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(1, fh_fastafs); + s->fourbit_add(1, fh_fastafs); MD5_Update(&s->ctx, nc, 1); break; case 'G': @@ -532,7 +551,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(2, fh_fastafs); + s->fourbit_add(2, fh_fastafs); MD5_Update(&s->ctx, ng, 1); break; case 'g': @@ -541,7 +560,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block 
= true; } - s->add_fourbit_ACTG(2, fh_fastafs); + s->fourbit_add(2, fh_fastafs); MD5_Update(&s->ctx, ng, 1); break; case 'T': @@ -550,7 +569,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(3, fh_fastafs); + s->fourbit_add(3, fh_fastafs); MD5_Update(&s->ctx, nt, 1); break; case 't': @@ -559,7 +578,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(3, fh_fastafs); + s->fourbit_add(3, fh_fastafs); MD5_Update(&s->ctx, nt, 1); break; case 'U': @@ -568,7 +587,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(4, fh_fastafs); + s->fourbit_add(4, fh_fastafs); MD5_Update(&s->ctx, nu, 1); break; case 'u': @@ -577,7 +596,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(4, fh_fastafs); + s->fourbit_add(4, fh_fastafs); MD5_Update(&s->ctx, nu, 1); break; @@ -587,7 +606,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(5, fh_fastafs); + s->fourbit_add(5, fh_fastafs); MD5_Update(&s->ctx, nr, 1); break; case 'r': @@ -596,7 +615,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(5, fh_fastafs); + s->fourbit_add(5, fh_fastafs); MD5_Update(&s->ctx, nr, 1); break; case 'Y': @@ -605,7 +624,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(6, fh_fastafs); + s->fourbit_add(6, fh_fastafs); MD5_Update(&s->ctx, ny, 1); break; case 'y': @@ -614,7 +633,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(6, fh_fastafs); + s->fourbit_add(6, 
fh_fastafs); MD5_Update(&s->ctx, ny, 1); break; case 'K': @@ -623,7 +642,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(7, fh_fastafs); + s->fourbit_add(7, fh_fastafs); MD5_Update(&s->ctx, nk, 1); break; case 'k': @@ -632,7 +651,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(7, fh_fastafs); + s->fourbit_add(7, fh_fastafs); MD5_Update(&s->ctx, nk, 1); break; case 'M': @@ -641,7 +660,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(8, fh_fastafs); + s->fourbit_add(8, fh_fastafs); MD5_Update(&s->ctx, nm, 1); break; case 'm': @@ -650,7 +669,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(8, fh_fastafs); + s->fourbit_add(8, fh_fastafs); MD5_Update(&s->ctx, nm, 1); break; case 'S': @@ -659,7 +678,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(9, fh_fastafs); + s->fourbit_add(9, fh_fastafs); MD5_Update(&s->ctx, ns, 1); break; case 's': @@ -668,7 +687,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(9, fh_fastafs); + s->fourbit_add(9, fh_fastafs); MD5_Update(&s->ctx, ns, 1); break; case 'W': @@ -677,7 +696,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(10, fh_fastafs); + s->fourbit_add(10, fh_fastafs); MD5_Update(&s->ctx, nw, 1); break; case 'w': @@ -686,7 +705,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(10, fh_fastafs); + s->fourbit_add(10, fh_fastafs); MD5_Update(&s->ctx, nw, 1); break; case 'B': @@ 
-695,7 +714,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(11, fh_fastafs); + s->fourbit_add(11, fh_fastafs); MD5_Update(&s->ctx, nb, 1); break; case 'b': @@ -704,7 +723,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(11, fh_fastafs); + s->fourbit_add(11, fh_fastafs); MD5_Update(&s->ctx, nb, 1); break; case 'D': @@ -713,7 +732,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(12, fh_fastafs); + s->fourbit_add(12, fh_fastafs); MD5_Update(&s->ctx, nd, 1); break; case 'd': @@ -722,7 +741,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(12, fh_fastafs); + s->fourbit_add(12, fh_fastafs); MD5_Update(&s->ctx, nd, 1); break; case 'H': @@ -731,7 +750,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(13, fh_fastafs); + s->fourbit_add(13, fh_fastafs); MD5_Update(&s->ctx, nh, 1); break; case 'h': @@ -740,7 +759,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(13, fh_fastafs); + s->fourbit_add(13, fh_fastafs); MD5_Update(&s->ctx, nh, 1); break; case 'V': @@ -749,7 +768,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(14, fh_fastafs); + s->fourbit_add(14, fh_fastafs); MD5_Update(&s->ctx, nv, 1); break; case 'v': @@ -758,7 +777,7 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(14, fh_fastafs); + s->fourbit_add(14, fh_fastafs); MD5_Update(&s->ctx, nv, 1); break; case 'N': @@ -767,7 +786,7 @@ size_t fasta_to_fastafs(const 
std::string &fasta_file, const std::string &fastaf s->in_m_block = false; } - s->add_fourbit_ACTG(15, fh_fastafs); + s->fourbit_add(15, fh_fastafs); MD5_Update(&s->ctx, nn, 1); break; case 'n': @@ -776,73 +795,579 @@ size_t fasta_to_fastafs(const std::string &fasta_file, const std::string &fastaf s->in_m_block = true; } - s->add_fourbit_ACTG(15, fh_fastafs); + s->fourbit_add(15, fh_fastafs); MD5_Update(&s->ctx, nn, 1); break; case '-': - s->add_N(); + s->add_unknown(); break; + // @todo case for those only in protein seq + default: - throw std::runtime_error("[fasta_to_x_fastafs] invalid chars in FASTA file"); + s->current_dict = DICT_FIVEBIT; break; } } - } - } - - running = getline(fh_fasta, line).good(); - // if not running, recheck - if(!running) { - if(auto_recompress_to_fourbit && s->current_dict == DICT_TWOBIT && s->N_bytes_used() > s->twobit_bytes_used()) { - fh_fasta.clear();// get it out of EOF state + if(s->current_dict != DICT_FOURBIT) { + char dict = s->current_dict; // DICT_FOURBIT | DICT_FIVEBIT - fh_fasta.seekg(s->file_offset_in_fasta, std::ios::beg); - fh_fastafs.seekp(s->file_offset_in_fastafs + 4, std::ios::beg);// plus four, skipping the size + // set to fourbit and re-intialize + // seek fasta header to beg + s->file_offset_in_fasta + // seek fastafs back to beg + s->file_offset_in_fastafs and overwrite + //throw std::runtime_error("[fasta_to_fastafs] invalid chars in FASTA file"); + fh_fasta.seekg(s->file_offset_in_fasta, std::ios::beg); + fh_fastafs.seekp(s->file_offset_in_fastafs + 4, std::ios::beg);// plus four, skipping the size - s->flush(); - s->current_dict = DICT_FOURBIT; - - //after re-opening file and setting the file pointer, read line again - running = getline(fh_fasta, line).good(); - } - else { - if(s->current_dict == DICT_TWOBIT) { - s->finish_twobit_sequence(fh_fastafs);// finish last sequence - } else { - s->finish_fourbit_sequence(fh_fastafs);// finish last sequence + s->flush(); + s->current_dict = dict; // set back to dict 
} - } - } - } - } - fh_fasta.close(); - } + } else { // s->current_dict == DICT_FIVEBIT + for(std::string::iterator it = line.begin(); it != line.end(); ++it) { + switch(*it) { + //ABCDEFGHIJKLMNOPQRSTUVWYZX*- + case 'A': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } - // write index/footer - unsigned int index_file_position = (uint32_t) fh_fastafs.tellp(); - char buffer[4 + 1]; - uint_to_fourbytes(buffer, (uint32_t) index.size()); - fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + s->fivebit_add(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'a': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } - for(size_t i = 0; i < index.size(); i++) { - s = index[i]; + s->fivebit_add(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'B': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } - // set and write flag - fastafs_sequence_flags fsf; - fsf.set_linear(); + s->fivebit_add(1, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'b': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } - if(s->current_dict == DICT_TWOBIT) { - if(s->has_U) { - fsf.set_rna(); - } else { - fsf.set_dna(); - } - } else { - fsf.set_iupec_nucleotide(); + s->fivebit_add(1, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'C': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(2, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'c': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(2, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'D': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(3, fh_fastafs); + 
MD5_Update(&s->ctx, nd, 1); + break; + case 'd': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(3, fh_fastafs); + MD5_Update(&s->ctx, nd, 1); + break; + case 'E': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(4, fh_fastafs); + MD5_Update(&s->ctx, ne, 1); + break; + case 'e': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(4, fh_fastafs); + MD5_Update(&s->ctx, ne, 1); + break; + case 'F': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(5, fh_fastafs); + MD5_Update(&s->ctx, nf, 1); + break; + case 'f': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(5, fh_fastafs); + MD5_Update(&s->ctx, nf, 1); + break; + case 'G'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(6, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'g': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(6, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'H'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(7, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'h': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(7, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'I'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(8, fh_fastafs); + MD5_Update(&s->ctx, ni, 1); + break; + 
case 'i': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(8, fh_fastafs); + MD5_Update(&s->ctx, ni, 1); + break; + case 'J'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(9, fh_fastafs); + MD5_Update(&s->ctx, nj, 1); + break; + case 'j': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(9, fh_fastafs); + MD5_Update(&s->ctx, nj, 1); + break; + case 'K'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(10, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'k': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(10, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'L'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(11, fh_fastafs); + MD5_Update(&s->ctx, nl, 1); + break; + case 'l': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(11, fh_fastafs); + MD5_Update(&s->ctx, nl, 1); + break; + case 'M'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(12, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'm': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(12, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'N'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(13, fh_fastafs); + 
MD5_Update(&s->ctx, nn, 1); + break; + case 'n': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(13, fh_fastafs); + MD5_Update(&s->ctx, nn, 1); + break; + case 'O'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(14, fh_fastafs); + MD5_Update(&s->ctx, no, 1); + break; + case 'o': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(14, fh_fastafs); + MD5_Update(&s->ctx, no, 1); + break; + case 'P'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(15, fh_fastafs); + MD5_Update(&s->ctx, np, 1); + break; + case 'p': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(15, fh_fastafs); + MD5_Update(&s->ctx, np, 1); + break; + case 'Q'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(16, fh_fastafs); + MD5_Update(&s->ctx, nq, 1); + break; + case 'q': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(16, fh_fastafs); + MD5_Update(&s->ctx, nq, 1); + break; + case 'R'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(17, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); + break; + case 'r': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(17, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); + break; + case 'S'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + 
s->fivebit_add(18, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 's': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(18, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 'T'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(19, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); + break; + case 't': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(19, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); + break; + case 'U': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(20, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); + break; + case 'u': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(20, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); + break; + case 'V': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(21, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); + break; + case 'v': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(21, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); + break; + + case 'W'://ABCDEFGHIJKLMNOPQRSTUVWYZX*- + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(22, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + case 'w': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(22, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + + case 'Y': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(23, fh_fastafs); + MD5_Update(&s->ctx, 
ny, 1); + break; + case 'y': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(23, fh_fastafs); + MD5_Update(&s->ctx, ny, 1); + break; + case 'Z': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(24, fh_fastafs); + MD5_Update(&s->ctx, nz, 1); + break; + case 'z': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(24, fh_fastafs); + MD5_Update(&s->ctx, nz, 1); + break; + case 'X': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->fivebit_add(25, fh_fastafs); + MD5_Update(&s->ctx, nx, 1); + break; + case 'x': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->fivebit_add(25, fh_fastafs); + MD5_Update(&s->ctx, nx, 1); + break; + + case '*': + s->fivebit_add(26, fh_fastafs); + break; + case '-': + s->fivebit_add(27, fh_fastafs); + break; + + + // @todo case for those only in protein seq + + default: + throw std::runtime_error("[fasta_to_x_fastafs] invalid chars in FASTA file"); + break; + } + + + } + } + } + + running = getline(fh_fasta, line).good(); + + // if not running, recheck + if(!running) { + if(auto_recompress_to_fourbit && s->current_dict == DICT_TWOBIT && s->N_bytes_used() > s->twobit_bytes_used()) { + fh_fasta.clear();// get it out of EOF state + + fh_fasta.seekg(s->file_offset_in_fasta, std::ios::beg); + fh_fastafs.seekp(s->file_offset_in_fastafs + 4, std::ios::beg);// plus four, skipping the size + + s->flush(); + s->current_dict = DICT_FOURBIT; + + //after re-opening file and setting the file pointer, read line again + running = getline(fh_fasta, line).good(); + } else { + s->finish_sequence(fh_fastafs); + } + } + } + } + fh_fasta.close(); + } + + + // write index/footer + unsigned int index_file_position = (uint32_t) 
fh_fastafs.tellp(); + char buffer[4 + 1]; + uint_to_fourbytes(buffer, (uint32_t) index.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + for(size_t i = 0; i < index.size(); i++) { + s = index[i]; + + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + + if(s->current_dict == DICT_TWOBIT) { + if(s->has_U) { + fsf.set_rna(); + } else { + fsf.set_dna(); + } + } else if(s->current_dict == DICT_FOURBIT) { + fsf.set_iupec_nucleotide(); + } else { + fsf.set_protein(); // set protein } fsf.set_complete(); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index b247f29a..8e948e00 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -32,6 +32,7 @@ #include "twobit_byte.hpp" #include "fourbit_byte.hpp" +#include "fivebit_fivebytes.hpp" #include "fastafs.hpp" #include "utils.hpp" @@ -65,7 +66,7 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) -void fastafs_seq::view_fasta(ffs2f_init_seq* cache, std::ifstream *fh) +void fastafs_seq::view_fasta(ffs2f_init_seq* cache, chunked_reader &fh) { char buffer[READ_BUFFER_SIZE];// = new char [READ_BUFFER_SIZE]; uint32_t offset = 0; @@ -141,18 +142,27 @@ uint32_t fastafs_seq::view_fasta_chunk( size_t buffer_size, off_t start_pos_in_fasta, - std::ifstream *fh) + chunked_reader &fh) { + uint32_t written_iter; + uint32_t written = 0; - if(this->flags.is_twobit()) { - if(this->flags.is_dna()) { - return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + do { + if(this->flags.is_twobit()) { + if(this->flags.is_dna()) { + written_iter = this->view_fasta_chunk_generalized(cache, buffer + written, buffer_size - written, start_pos_in_fasta + written, fh); + } else { + written_iter = this->view_fasta_chunk_generalized(cache, buffer + written, buffer_size - written, start_pos_in_fasta + written, fh); + } + } else if(this->flags.is_fourbit()) { + written_iter = this->view_fasta_chunk_generalized(cache, buffer + written, buffer_size - written, start_pos_in_fasta + 
written, fh); } else { - return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + written_iter = this->view_fasta_chunk_generalized(cache, buffer + written, buffer_size - written, start_pos_in_fasta + written, fh); } - } else { - return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); - } + written += written_iter; + } while((written_iter > 0) and (written < buffer_size)); + + return written; } @@ -171,14 +181,14 @@ uint32_t fastafs_seq::view_fasta_chunk( * * @todo see if this can be a std::ifstream or some kind of stream type of object? */ -template uint32_t fastafs_seq::view_fasta_chunk_generalized( +template inline uint32_t fastafs_seq::view_fasta_chunk_generalized( ffs2f_init_seq* cache, char *buffer, size_t buffer_size, off_t start_pos_in_fasta, - std::ifstream *fh) + chunked_reader &fh) { #if DEBUG if(cache == nullptr) { @@ -234,14 +244,16 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( size_t n_block = cache->n_starts.size(); size_t m_block = cache->m_starts.size(); uint32_t newlines_passed = offset_from_sequence_line / (cache->padding + 1);// number of newlines passed (within the sequence part) - uint32_t nucleotide_pos = offset_from_sequence_line - newlines_passed;// requested nucleotide in file + const uint32_t nucleotide_pos = offset_from_sequence_line - newlines_passed;// requested nucleotide in file // calculate file position for next twobit // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there uint32_t n_passed = 0; this->get_n_offset(nucleotide_pos, &n_passed); - fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / T::nucleotides_per_byte), fh->beg); - + uint32_t compressed_nucleotide_offset = nucleotide_pos - n_passed; // number of nucleotides [NACT / compressed] behind us + fh.seek((uint32_t) this->data_position + 4 + 
T::nucleotides_to_compressed_fileoffset(compressed_nucleotide_offset)); + unsigned char bit_offset = compressed_nucleotide_offset % T::nucleotides_per_chunk;// twobit -> 4, fourbit: -> 2 + /* 0 0 0 0 1 1 1 1 << desired offset from starting point A C T G A C T G @@ -255,37 +267,10 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( // const char *chunk = t.encode_hash[0];// init // unsigned char bit_offset = (nucleotide_pos - n_passed) % t.nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 - - // big buffer - //@todo avoid dynamic allocation and fix buffer size? - // char *buffer[4096 + 4]; - // watch out for off-grid requests: a 2byte buffer may 3 bytes reserved at least - // X X - // [ | | | | ] [ | | | | ] - //char *from_file_buffer; - //from_file_buffer = (char *) malloc(sizeof(char) * ((buffer_size / T::nucleotides_per_byte) + 5)); // kan zeker 4x kleiner - char from_file_buffer[(READ_BUFFER_SIZE / 2) + 6]; - - fh->read(from_file_buffer, (buffer_size / T::nucleotides_per_byte) + 4); - if(!fh->good()) { - fh->clear();// out of bound oterhwise - } - - uint ff = 0; - - /* - printf("size = (reserved = %i) (read = %i)", ((buffer_size / 4) + 2) , (buffer_size / 4) + 1); - printf(" (actual: %i)\n",fh->gcount() ); - */ - - const char *chunk = t.encode_hash[0];// init - unsigned char bit_offset = (nucleotide_pos - n_passed) % T::nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 + char *chunk = (char *) t.encode_hash[1];// init if(bit_offset != 0) { - //fh->read((char*)(&t.data), 1); - t.data = from_file_buffer[ff]; - ff++; - + t.next(fh); chunk = t.get(); } while(n_block > 0 and pos <= cache->n_ends[n_block - 1]) { // iterate back @@ -305,17 +290,14 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( while(pos < pos_limit) {// while next sequence-containing-line is open if(pos >= cache->n_starts[n_block]) { if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = t.n_fill_masked; + buffer[written++] = 
T::n_fill_masked; } else { - buffer[written++] = t.n_fill_unmasked; + buffer[written++] = T::n_fill_unmasked; } } else { - if(bit_offset % T::nucleotides_per_byte == 0) { - //fh->read((char*)(&t.data), 1); - t.data = from_file_buffer[ff]; - ff++; - + if(bit_offset % T::nucleotides_per_chunk == 0) { + t.next(fh); chunk = t.get(); } @@ -325,7 +307,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( buffer[written++] = chunk[bit_offset]; } - bit_offset = (unsigned char)(bit_offset + 1) % t.nucleotides_per_byte; + bit_offset = (unsigned char)(bit_offset + 1) % T::nucleotides_per_chunk; } if(pos == cache->n_ends[n_block]) { n_block++; @@ -392,7 +374,9 @@ size_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_re return total_requested_size; } -uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset, std::ifstream *fh) +uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, + sequence_region* sr, char *buffer, size_t size, off_t offset, + chunked_reader &fh) { #if DEBUG if(cache == nullptr) { @@ -457,7 +441,7 @@ fastafs check short 1.53s user 2.73s system 99% cpu 4.269 total chunk size 1024: ?? 
*/ -std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) +std::string fastafs_seq::sha1(ffs2f_init_seq* cache, chunked_reader &fh) { #if DEBUG if(cache == nullptr) { @@ -475,7 +459,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) SHA_CTX ctx; SHA1_Init(&ctx); - fh->clear(); + //fh->clear(); // "(a/b)*b + a%b shall equal a" // full iterations = this->n / chunk_size; do this number of iterations looped @@ -500,7 +484,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) //printf(" (%i * %i) + %i = %i = %i\n", n_iterations , chunksize, remaining_bytes , (n_iterations * chunksize) + remaining_bytes , this->n); unsigned char cur_sha1_digest[SHA_DIGEST_LENGTH]; SHA1_Final(cur_sha1_digest, &ctx); - fh->clear(); // because gseek was done before + //fh->clear(); // because gseek was done before char sha1_hash[41]; sha1_digest_to_hash(cur_sha1_digest, sha1_hash); @@ -510,7 +494,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) -std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) +std::string fastafs_seq::md5(ffs2f_init_seq* cache, chunked_reader &fh) { #if DEBUG if(cache == nullptr) { @@ -528,7 +512,7 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) MD5_CTX ctx; MD5_Init(&ctx); - fh->clear(); + //fh->clear(); // "(a/b)*b + a%b shall equal a" // full iterations = this->n / chunk_size; do this number of iterations looped @@ -564,7 +548,7 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) //printf(" (%i * %i) + %i = %i = %i\n", n_iterations , chunksize, remaining_bytes , (n_iterations * chunksize) + remaining_bytes , this->n); unsigned char cur_md5_digest[MD5_DIGEST_LENGTH]; MD5_Final(cur_md5_digest, &ctx); - fh->clear(); // because gseek was done before + //fh->clear(); // because gseek was done before char md5_hash[32 + 1]; md5_digest_to_hash(cur_md5_digest, md5_hash); @@ -664,25 +648,24 @@ void 
fastafs::load(std::string afilename) std::streampos size; char *memblock; - std::ifstream file(afilename, std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { + chunked_reader fh_in = chunked_reader(afilename.c_str()); + { + memblock = new char [20 + 1]; //sha1 is 20b // if a user can't compile this line, please replace it with C's // 'realpath' function and delete/free afterwards and send a PR //this->filename = std::filesystem::canonical(afilename);// this path must be absolute because if stuff gets send to FUSE, paths are relative to the FUSE process and probably systemd initialization this->filename = realpath_cpp(afilename); - - size = file.tellg(); + size = (size_t) fh_in.read(memblock, 16); if(size < 16) { - file.close(); + //file.close(); throw std::invalid_argument("Corrupt file: " + filename); } else { - memblock = new char [20 + 1]; //sha1 is 20b - file.seekg(0, std::ios::beg); + fh_in.seek(0); uint32_t i; // HEADER - file.read(memblock, 14); + fh_in.read(memblock, 14); memblock[16] = '\0'; // check magic @@ -702,23 +685,11 @@ void fastafs::load(std::string afilename) throw std::invalid_argument("Incomplete FASTAFS file (probably terminated during conversion): " + filename); } - /* - unsigned char bits; - unsigned char bits_per_byte; - if(this->flags.is_twobit()) { - bits = 2; - bits_per_byte = 4; - } - else { - bits = 4; - bits_per_byte = 2; - }*/ - std::streampos file_cursor = (std::streampos) fourbytes_to_uint(&memblock[10], 0); // INDEX - file.seekg(file_cursor, std::ios::beg); - file.read(memblock, 4); + fh_in.seek(file_cursor); + fh_in.read(memblock, 4); this->data.resize(fourbytes_to_uint(memblock, 0));//n_seq becomes this->data.size() size_t j; @@ -727,97 +698,95 @@ void fastafs::load(std::string afilename) s = new fastafs_seq; // flag - file.read(memblock, 2); + fh_in.read(memblock, 2); s->flags.set(memblock);// should be initialized during construction of this class // name length - file.read(memblock, 1); + 
fh_in.read(memblock, 1); // name size_t namesize = (unsigned char) memblock[0]; // cast to something that is large enough (> 128) char name[namesize + 1]; - file.read(name, namesize); + fh_in.read(name, namesize); name[(unsigned char) memblock[0]] = '\0'; s->name = std::string(name); // set cursor and save sequence data position - file.read(memblock, 4); - file_cursor = file.tellg(); + fh_in.read(memblock, 4); + file_cursor = fh_in.tell(); s->data_position = fourbytes_to_uint(memblock, 0); - file.seekg((uint32_t) s->data_position, file.beg); + fh_in.seek((uint32_t) s->data_position); { // sequence stuff // n compressed nucleotides - file.read(memblock, 4); + fh_in.read(memblock, 4); s->n = fourbytes_to_uint(memblock, 0); // skip nucleotides if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, - file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); + fh_in.seek((uint32_t) s->data_position + 4 + ((s->n + 3) / 4)); } else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, - file.seekg((uint32_t) s->data_position + 4 + ((s->n + 1) / 2), file.beg); + fh_in.seek((uint32_t) s->data_position + 4 + ((s->n + 1) / 2)); + } else { + fh_in.seek((uint32_t) s->data_position + 4 + fivebit_fivebytes::nucleotides_to_compressed_offset(s->n)); } // N-blocks (and update this->n instantly) - file.read(memblock, 4); + fh_in.read(memblock, 4); uint32_t N_blocks = fourbytes_to_uint(memblock, 0); s->n_starts.resize(N_blocks); s->n_ends.resize(N_blocks); for(j = 0; j < s->n_starts.size(); j++) { - file.read(memblock, 4); + fh_in.read(memblock, 4); s->n_starts[j] = fourbytes_to_uint(memblock, 0); } for(j = 0; j < s->n_ends.size(); j++) { - file.read(memblock, 4); + fh_in.read(memblock, 4); s->n_ends[j] = fourbytes_to_uint(memblock, 0); s->n += s->n_ends[j] - s->n_starts[j] + 1; } // MD5-checksum - only if sequence is complete if(s->flags.is_complete()) { - file.read(memblock, 16); - for(int j = 0; j < 16 ; j ++) { + 
fh_in.read(memblock, 16); + for(int j = 0; j < 16 ; j++) { s->md5_digest[j] = memblock[j]; } } // M-blocks - file.read(memblock, 4); + fh_in.read(memblock, 4); uint32_t M_blocks = fourbytes_to_uint(memblock, 0); s->m_starts.resize(M_blocks); s->m_ends.resize(M_blocks); for(j = 0; j < s->m_starts.size(); j++) { - file.read(memblock, 4); + fh_in.read(memblock, 4); s->m_starts[j] = fourbytes_to_uint(memblock, 0); } for(j = 0; j < s->m_ends.size(); j++) { - file.read(memblock, 4); + fh_in.read(memblock, 4); s->m_ends[j] = fourbytes_to_uint(memblock, 0); } } - file.seekg(file_cursor, file.beg); + fh_in.seek(file_cursor); this->data[i] = s; } // metadata section - empty for now - file.read(memblock, 1); + fh_in.read(memblock, 1); // crc32 checksum - may be missing because fastafs::load is also used before fastafs::get_crc32 is ran to obtain the checksum - file.read(memblock, 4); - if(file.gcount() == 4) { + if(fh_in.read(memblock, 4) == 4) { this->crc32f = fourbytes_to_uint(memblock, 0); } else { //printf("crc32 checksum missing\n"); } - file.close(); delete[] memblock; } - } else { - throw std::invalid_argument("Unable to open file '" + afilename + "'"); } } @@ -828,14 +797,16 @@ void fastafs::view_fasta(ffs2f_init* cache) throw std::invalid_argument("No filename found"); } - std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - for(uint32_t i = 0; i < this->data.size(); i++) { - this->data[i]->view_fasta(cache->sequences[i], &file); - } + //std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + //if(file.is_open()) { + chunked_reader fh = chunked_reader(this->filename.c_str()); - file.close(); + for(uint32_t i = 0; i < this->data.size(); i++) { + this->data[i]->view_fasta(cache->sequences[i], fh); } + + // file.close(); + //} } @@ -854,7 +825,6 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) - // estimates the whole file size of a file such 
as "/seq/chr1:56-" size_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg) { @@ -906,18 +876,19 @@ uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region } #endif - std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - // parse "chr..:..-.." string - sequence_region sr = sequence_region(seq_region_arg); + chunked_reader fh = chunked_reader(this->filename.c_str()); + //std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + //if(file.is_open()) { + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); - // 02 : check if 'chr' is equals this->data[i].name - for(size_t i = 0; i < this->data.size(); i++) { - if(sr.seq_name.compare(this->data[i]->name) == 0) { - return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset, &file); - } + // 02 : check if 'chr' is equals this->data[i].name + for(size_t i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset, fh); } } + //} return 0; } @@ -936,46 +907,58 @@ uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region * returns */ uint32_t fastafs::view_fasta_chunk(ffs2f_init* cache, char *buffer, size_t buffer_size, off_t file_offset) +{ + + chunked_reader fh = chunked_reader(this->filename.c_str()); + + return this->view_fasta_chunk(cache, buffer, buffer_size, file_offset, fh); +} + + + +uint32_t fastafs::view_fasta_chunk(ffs2f_init* cache, char *buffer, size_t buffer_size, off_t file_offset, chunked_reader &fh) { uint32_t written = 0; - std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - size_t i = 0;// sequence iterator - uint32_t pos = (uint32_t) file_offset; - 
fastafs_seq *seq; - - while(i < data.size()) { - seq = this->data[i]; - const uint32_t sequence_file_size = seq->fasta_filesize(cache->padding_arg); - - if(pos < sequence_file_size) { - const uint32_t written_seq = seq->view_fasta_chunk( - cache->sequences[i], - &buffer[written], - std::min((uint32_t) buffer_size - written, sequence_file_size), - pos, - &file); - - written += written_seq; - pos -= (sequence_file_size - written_seq); - - if(written == buffer_size) { - file.close(); - return written; - } - } else { - pos -= sequence_file_size; - } - i++; + size_t i = 0;// sequence iterator + uint32_t pos = (uint32_t) file_offset; + fastafs_seq *seq; + + while(i < data.size()) { + seq = this->data[i]; + const uint32_t sequence_file_size = seq->fasta_filesize(cache->padding_arg); + + if(pos < sequence_file_size) { + const uint32_t written_seq = seq->view_fasta_chunk( + cache->sequences[i], + &buffer[written], + std::min((uint32_t) buffer_size - written, sequence_file_size), + pos, + fh); + + written += written_seq; + pos -= (sequence_file_size - written_seq); + + if(written == buffer_size) { + return written; + } + } else { + pos -= sequence_file_size; } - file.close(); - } else { - throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); + + i++; } + return written; } + + + + + + + //http://genome.ucsc.edu/FAQ/FAQformat.html#format7 //https://www.mathsisfun.com/binary-decimal-hexadecimal-converter.html uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t file_offset) @@ -984,152 +967,179 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi uint32_t pos = (uint32_t) file_offset; // iterator (position, in bytes) in file uint32_t pos_limit = 0; // counter to keep track of when writing needs to stop for given loop - std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - char n_seq[4]; - pos_limit += 4;// skip this 
loop after writing first four bytes - while(pos < pos_limit) { - buffer[written++] = UCSC2BIT_MAGIC[pos]; + //std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + //if(file.is_open()) { + chunked_reader file = chunked_reader(this->filename.c_str()); + char n_seq[4]; + pos_limit += 4;// skip this loop after writing first four bytes + while(pos < pos_limit) { + buffer[written++] = UCSC2BIT_MAGIC[pos]; + pos++; + if(written >= buffer_size) { + return written; + } + } + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = UCSC2BIT_VERSION[pos - 4]; + pos++; + if(written >= buffer_size) { + return written; + } + } + // number sequences + uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) this->data.size()); + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = n_seq[pos - 8]; + pos++; + if(written >= buffer_size) { + return written; + } + } + // 4 x nullbyte + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = '\0'; + pos++; + if(written >= buffer_size) { + return written; + } + } + uint32_t header_block_len = 4 + 4 + 4 + 4 + ((uint32_t) this->data.size() * (1 + 4)); + uint32_t header_offset_previous = 0; + for(uint32_t i = 0; i < this->data.size(); i++) { + header_block_len += (uint32_t) this->data[i]->name.size(); + } + + fastafs_seq *sequence; + size_t i; + for(i = 0; i < this->data.size(); i++) { + sequence = this->data[i]; + + // single byte can be written, as the while loop has returned true + pos_limit += 1; + if(pos < pos_limit) { + buffer[written++] = (unsigned char) sequence->name.size(); pos++; if(written >= buffer_size) { return written; } } - pos_limit += 4; + + // sequence name + pos_limit += (uint32_t) sequence->name.size(); while(pos < pos_limit) { - buffer[written++] = UCSC2BIT_VERSION[pos - 4]; + buffer[written++] = sequence->name[sequence->name.size() - (pos_limit - pos)]; pos++; if(written >= buffer_size) { return written; } } - // number sequences - 
uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) this->data.size()); + + // file offset + uint32_t offset = header_block_len + header_offset_previous; + uint_to_fourbytes_ucsc2bit(n_seq, offset); pos_limit += 4; while(pos < pos_limit) { - buffer[written++] = n_seq[pos - 8]; + buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; if(written >= buffer_size) { return written; } } - // 4 x nullbyte + + header_offset_previous += 4 + 4 + 4 + 4; + header_offset_previous += 8 * (uint32_t) sequence->n_starts.size(); + header_offset_previous += 8 * (uint32_t) sequence->m_starts.size(); + header_offset_previous += sequence->n / 4; + + if(sequence->n % 4 != 0) { + header_offset_previous++; + } + } + + ffs2f_init* cache = this->init_ffs2f(0, false); // false, no masking needed, always upper-case is fine in this case + for(i = 0; i < this->data.size(); i++) { + sequence = this->data[i]; + + // number nucleotides + uint_to_fourbytes_ucsc2bit(n_seq, sequence->n); pos_limit += 4; while(pos < pos_limit) { - buffer[written++] = '\0'; + buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; if(written >= buffer_size) { + delete cache; return written; } } - uint32_t header_block_len = 4 + 4 + 4 + 4 + ((uint32_t) this->data.size() * (1 + 4)); - uint32_t header_offset_previous = 0; - for(uint32_t i = 0; i < this->data.size(); i++) { - header_block_len += (uint32_t) this->data[i]->name.size(); - } - - fastafs_seq *sequence; - size_t i; - for(i = 0; i < this->data.size(); i++) { - sequence = this->data[i]; - // single byte can be written, as the while loop has returned true - pos_limit += 1; - if(pos < pos_limit) { - buffer[written++] = (unsigned char) sequence->name.size(); - pos++; - if(written >= buffer_size) { - return written; - } + // number N blocks + uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) sequence->n_starts.size()); + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = n_seq[4 - (pos_limit - pos)]; + pos++; + if(written >= buffer_size) { + delete cache; + return 
written; } + } - // sequence name - pos_limit += (uint32_t) sequence->name.size(); + // write n-blocks effectively down! + for(uint32_t k = 0; k < sequence->n_starts.size(); k++) { + uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_starts[k]); + pos_limit += 4; while(pos < pos_limit) { - buffer[written++] = sequence->name[sequence->name.size() - (pos_limit - pos)]; + buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; if(written >= buffer_size) { + delete cache; return written; } } - // file offset - uint32_t offset = header_block_len + header_offset_previous; - uint_to_fourbytes_ucsc2bit(n_seq, offset); + uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_ends[k] - sequence->n_starts[k] + 1); pos_limit += 4; while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; if(written >= buffer_size) { + delete cache; return written; } } + } - header_offset_previous += 4 + 4 + 4 + 4; - header_offset_previous += 8 * (uint32_t) sequence->n_starts.size(); - header_offset_previous += 8 * (uint32_t) sequence->m_starts.size(); - header_offset_previous += sequence->n / 4; + // number M blocks (masked regions; lower case regions) + uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) sequence->m_starts.size()); + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = n_seq[4 - (pos_limit - pos)]; + pos++; - if(sequence->n % 4 != 0) { - header_offset_previous++; + if(written >= buffer_size) { + delete cache; + return written; } } - ffs2f_init* cache = this->init_ffs2f(0, false); // false, no masking needed, always upper-case is fine in this case - for(i = 0; i < this->data.size(); i++) { - sequence = this->data[i]; - - // number nucleotides - uint_to_fourbytes_ucsc2bit(n_seq, sequence->n); + // write m-blocks effectively down! 
+ for(uint32_t k = 0; k < sequence->m_starts.size(); k++) { + uint_to_fourbytes_ucsc2bit(n_seq, sequence->m_starts[k]); pos_limit += 4; while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; - if(written >= buffer_size) { - delete cache; - return written; - } - } - // number N blocks - uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) sequence->n_starts.size()); - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = n_seq[4 - (pos_limit - pos)]; - pos++; if(written >= buffer_size) { delete cache; return written; } } - // write n-blocks effectively down! - for(uint32_t k = 0; k < sequence->n_starts.size(); k++) { - uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_starts[k]); - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = n_seq[4 - (pos_limit - pos)]; - pos++; - if(written >= buffer_size) { - delete cache; - return written; - } - } - - uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_ends[k] - sequence->n_starts[k] + 1); - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = n_seq[4 - (pos_limit - pos)]; - pos++; - if(written >= buffer_size) { - delete cache; - return written; - } - } - } - - // number M blocks (masked regions; lower case regions) - uint_to_fourbytes_ucsc2bit(n_seq, (uint32_t) sequence->m_starts.size()); + uint_to_fourbytes_ucsc2bit(n_seq, sequence->m_ends[k] - sequence->m_starts[k] + 1); pos_limit += 4; while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; @@ -1140,55 +1150,49 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi return written; } } + } - // write m-blocks effectively down! 
- for(uint32_t k = 0; k < sequence->m_starts.size(); k++) { - uint_to_fourbytes_ucsc2bit(n_seq, sequence->m_starts[k]); - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = n_seq[4 - (pos_limit - pos)]; - pos++; - - if(written >= buffer_size) { - delete cache; - return written; - } - } - - uint_to_fourbytes_ucsc2bit(n_seq, sequence->m_ends[k] - sequence->m_starts[k] + 1); - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = n_seq[4 - (pos_limit - pos)]; - pos++; + // reserved block + pos_limit += 4; + while(pos < pos_limit) { + buffer[written++] = '\0'; + pos++; - if(written >= buffer_size) { - delete cache; - return written; - } - } + if(written >= buffer_size) { + delete cache; + return written; } + } - // reserved block - pos_limit += 4; - while(pos < pos_limit) { - buffer[written++] = '\0'; - pos++; + // twobit coded nucleotides (only containing 4 nucleotides each) + uint32_t full_twobits = sequence->n / 4; + twobit_byte_dna t; + pos_limit += full_twobits; - if(written >= buffer_size) { - delete cache; - return written; - } + while(pos < pos_limit) { + //printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j); + //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), file); + t.set(n_seq); + buffer[written++] = t.data; + pos++; + if(written >= buffer_size) { + delete cache; + return written; } + } - // twobit coded nucleotides (only containing 4 nucleotides each) - uint32_t full_twobits = sequence->n / 4; - twobit_byte_dna t; - pos_limit += full_twobits; - - while(pos < pos_limit) { - //printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j); - //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file); - 
sequence->view_fasta_chunk(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file); + // last byte, may also rely on 1,2 or 3 nucleotides and reqiures setting 0's + if(full_twobits * 4 < sequence->n) { + n_seq[0] = 'N'; + n_seq[1] = 'N'; + n_seq[2] = 'N'; + n_seq[3] = 'N'; + pos_limit += 1; + if(pos < pos_limit) { + //printf("%i - %i = %i || %i :: %i == %i \n",pos_limit,pos, full_twobits * 4, j, sequence->n - (full_twobits * 4), sequence->n - j); + //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, file); t.set(n_seq); buffer[written++] = t.data; pos++; @@ -1197,33 +1201,13 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi return written; } } - - // last byte, may also rely on 1,2 or 3 nucleotides and reqiures setting 0's - if(full_twobits * 4 < sequence->n) { - n_seq[0] = 'N'; - n_seq[1] = 'N'; - n_seq[2] = 'N'; - n_seq[3] = 'N'; - pos_limit += 1; - if(pos < pos_limit) { - //printf("%i - %i = %i || %i :: %i == %i \n",pos_limit,pos, full_twobits * 4, j, sequence->n - (full_twobits * 4), sequence->n - j); - //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file); - sequence->view_fasta_chunk(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file); - t.set(n_seq); - buffer[written++] = t.data; - pos++; - if(written >= buffer_size) { - delete cache; - return written; - } - } - } } - delete cache; - file.close(); - } else { - throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); } + delete cache; + //file.close(); + //} else { + // throw std::runtime_error("[fastafs::view_fasta_chunk] could not load 
fastafs: " + this->filename); + // } return written; } @@ -1576,11 +1560,13 @@ int fastafs::info(bool ena_verify_checksum) std::string compression_type; if(this->data[i]->flags.is_twobit()) { - compression_type = "2bit"; + compression_type = "2bit "; } else if(this->data[i]->flags.is_fourbit()) { - compression_type = "4bit"; + compression_type = "4bit "; + } else if(this->data[i]->flags.is_protein()) { + compression_type = "5/8bit"; } else { - compression_type = "????"; + compression_type = "???? "; } @@ -1659,6 +1645,7 @@ int fastafs::info(bool ena_verify_checksum) printf("\n"); } + file.close(); } @@ -1669,7 +1656,14 @@ int fastafs::info(bool ena_verify_checksum) // skips first four bytes and do not include crc32 at the end either uint32_t fastafs::get_crc32(void) { - return file_crc32(this->filename, 4, this->fastafs_filesize() - 4 ); // not sure why -4 rather than -4-4, but seems to work? + if(is_zstd_file((const char*) this->filename.c_str())) { + printf("crc32 verification does not work for zstd compressed archives yet\n"); + exit(1); + return false; + } + + // @ todo rewrite function only providing the chunked_reader obj + return file_crc32(this->filename, 4, this->fastafs_filesize() - 4); // not sure why -4 rather than -4-4, but seems to work? 
} @@ -1729,29 +1723,30 @@ bool fastafs::check_sequence_integrity(bool verbose) ffs2f_init* cache = this->init_ffs2f(0, false);// do not use masking, this checksum requires capital / upper case nucleotides - std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - for(uint32_t i = 0; i < this->data.size(); i++) { - md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); - old_hash = std::string(md5_hash); - - std::string new_hash = this->data[i]->md5(cache->sequences[i], &file); - if(old_hash.compare(new_hash) == 0) { - if(verbose) { - printf("OK\t%s\n", this->data[i]->name.c_str()); - } - } else { - if(verbose) { - printf("ERROR\t%s\t%s (encoded in fastafs) != %s (on disk)\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); - } + chunked_reader file = chunked_reader(this->filename.c_str()); + //std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + //if(file.is_open()) { + for(uint32_t i = 0; i < this->data.size(); i++) { + md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); + old_hash = std::string(md5_hash); - retcode = false; + std::string new_hash = this->data[i]->md5(cache->sequences[i], file); + if(old_hash.compare(new_hash) == 0) { + if(verbose) { + printf("OK\t%s\n", this->data[i]->name.c_str()); } + } else { + if(verbose) { + printf("ERROR\t%s\t%s (encoded in fastafs) != %s (on disk)\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); + } + + retcode = false; } - file.close(); - } else { - throw std::runtime_error("[fastafs::check_sequence_integrity] could not load fastafs: " + this->filename); } + //file.close(); + //} else { + // throw std::runtime_error("[fastafs::check_sequence_integrity] could not load fastafs: " + this->filename); + //} delete cache; diff --git a/src/fivebit_fivebytes.cpp b/src/fivebit_fivebytes.cpp new file mode 100644 index 00000000..98f945c0 --- /dev/null +++ b/src/fivebit_fivebytes.cpp @@ -0,0 +1,282 @@ 
+#include +#include + +#include "config.hpp" + +#include "fivebit_fivebytes.hpp" + +/* +alphabet = ABCDEFGHIJKLMNOPQRSTUVWYZX*- + +Gaps can be efficiently included + +In five bytes we can store nine 5bits: + +[00000111] +[11222223] +[33334444] +[46666677] +[77788888] + +idx: https://stackoverflow.com/questions/11509415/character-array-as-a-value-in-c-map + +A = 00000 (0) +B = 00001 (1) +C = 00010 (2) +... +Z = 11000 (24) +X = 11001 (25) +* = 11010 (26) +- = 11011 (27) +*/ + + +const char fivebit_fivebytes::fivebit_alphabet[28 + 1] = "ABCDEFGHIJKLMNOPQRSTUVWYZX*-"; +const char fivebit_fivebytes::encode_hash[28 + 1][2] = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "Z", "X", "*", "-" }; + + +// @todo, offset needs to be second parameter +void fivebit_fivebytes::set(unsigned char bit_offset, unsigned char amino_acid) +{ + // bit_offset: 0, 1, 2, 3, 4, 5, 6, 7 + this->data_decompressed[bit_offset] = fivebit_fivebytes::fivebit_alphabet[amino_acid]; + + switch(bit_offset) { + case 0: + // 00000111 11222223 33334444 45555566 66677777 + // -----111 + this->data_compressed[0] = (unsigned char)((this->data_decompressed[0] & ~(128 + 64 + 32 + 16 + 8)) | (amino_acid << 3)); + break; + case 1: + // 00000111 11222223 33334444 45555566 66677777 + + // xxx00000 + // --xxx000 + this->data_compressed[0] = (unsigned char)((this->data_compressed[0] & ~(4 + 2 + 1)) | amino_acid >> 2); + + // 00000111 11222223 33334444 45555566 66677777 + // --222223 + this->data_compressed[1] = (unsigned char)((this->data_compressed[1] & ~(128 + 64)) | amino_acid << 6); + break; + case 2: + // 00000000 + // 0000000- << 1 + // --00000- ~(2 + 1) << 6 + this->data_compressed[1] = (unsigned char)((this->data_compressed[1] & ~(32 + 16 + 8 + 4 + 2)) | (amino_acid << 1 & ~(128 + 64))); + break; + case 3: + // 00000111 11222223 33334444 45555566 66677777 + + // 000xxxxx + // -------x + this->data_compressed[1] = (unsigned 
char)((this->data_compressed[1] & ~(1)) | amino_acid >> 4); + + // 000xxxxx + // xxxx---- + this->data_compressed[2] = (unsigned char)((this->data_compressed[2] & ~((8 + 4 + 2 + 1) << 4)) | amino_acid << 4); + break; + case 4: + // 00000111 11222223 33334444 45555566 66677777 + + // 000xxxxx + // -000xxxx + this->data_compressed[2] = (unsigned char)((this->data_compressed[2] & ~(8 + 4 + 2 + 1)) | amino_acid >> 1); + + // + this->data_compressed[3] = (unsigned char)((this->data_compressed[3] & ~(128)) | amino_acid << 7); + break; + case 5: + // 00000111 11222223 33334444 45555566 66677777 + + // 000xxxxx + // 0xxxxx-- + this->data_compressed[3] = (unsigned char)((this->data_compressed[3] & ~(64 + 32 + 16 + 8 + 4)) | amino_acid << 2); + break; + case 6: + // 00000111 11222223 33334444 45555566 66677777 + + // 000xxxxx + // ---000xx + this->data_compressed[3] = (unsigned char)((this->data_compressed[3] & ~(2 + 1)) | amino_acid >> 3); + + + // 000xxxxx + // xxx----- + this->data_compressed[4] = (unsigned char)((this->data_compressed[4] & ~(128 + 64 + 32)) | amino_acid << 5); + + break; + case 7: + // 00000111 11222223 33334444 45555566 66677777 + + // 000xxxxx + // ---xxxxx + this->data_compressed[4] = (unsigned char)((this->data_compressed[4] & ~(16 + 8 + 4 + 2 + 1)) | (amino_acid & ~(128 + 64 + 32))); + break; + } +} + + +char *fivebit_fivebytes::get(void) +{ + return (char *) this->data_decompressed; +} + + + + +// @todo, offset needs to be second parameter +void fivebit_fivebytes::set_compressed(unsigned char (&compressed_data)[5]) +{ + this->data_compressed[0] = compressed_data[0]; + this->data_compressed[1] = compressed_data[1]; + this->data_compressed[2] = compressed_data[2]; + this->data_compressed[3] = compressed_data[3]; + this->data_compressed[4] = compressed_data[4]; + + this->unpack(); +} + + +void fivebit_fivebytes::unpack() +{ + // 00000111 11222223 33334444 45555566 66677777 + // 66677777 + // ---77777 + this->data_decompressed[7] = (unsigned 
char)(this->data_compressed[4] & ~(128 + 64 + 32)); + + // 00000111 11222223 33334444 45555566 66677777 + // 45555566 66677777 + // -----455 55566666 + // -----455 ---66666 + // ---66666 + this->data_decompressed[6] = (unsigned char)(((this->data_compressed[3] << 8) | (this->data_compressed[4])) >> 5); + this->data_decompressed[6] = (unsigned char)(this->data_decompressed[6] & ~(128 + 64 + 32)); + + // 00000111 11222223 33334444 45555566 66677777 + // 45555566 + // --455555 + // ---55555 + this->data_decompressed[5] = (unsigned char)(this->data_compressed[3] >> 2); + this->data_decompressed[5] = (unsigned char)(this->data_decompressed[5] & ~(128 + 64 + 32));// only bit 6 should be set to 0 + + // 00000111 11222223 33334444 45555566 66677777 + // 33334444 45555566 + // -------3 33344444 + // 33344444 + // ---44444 + this->data_decompressed[4] = (unsigned char)(((this->data_compressed[2] << 8) | (this->data_compressed[3])) >> 7); + this->data_decompressed[4] = (unsigned char)(this->data_decompressed[4] & ~(128 + 64 + 32)); + + // 00000111 11222223 33334444 45555566 66677777 + // 11222223 33334444 bit shift << 8 + normal + // ----1122 22233333 bit shift >> 4 + // 22233333 convert to u-char + // ---33333 set zero's + this->data_decompressed[3] = (unsigned char)(((this->data_compressed[1] << 8) | (this->data_compressed[2])) >> 4); + this->data_decompressed[3] = (unsigned char)(this->data_decompressed[3] & ~(128 + 64 + 32)); + + // 00101000 00100101 00110000 01101100 10110010 + // 00000111 11222223 33334444 45555566 66677777 + // 11222223 + // -1122222 + // ---22222 + this->data_decompressed[2] = (unsigned char)(this->data_compressed[1] >> 1); // shifts of unsigned types always zero-fill :) + this->data_decompressed[2] = (unsigned char)(this->data_decompressed[2] & ~(128 + 64 + 32)); // i think only bit 6 and 7 need to be set because of the shift above + + // 00000111 11222223 33334444 45555566 66677777 + // 00000111 11222223 + // ------00 00011111 + // 00011111 + // 
---11111 + this->data_decompressed[1] = (unsigned char)(((this->data_compressed[0] << 8) | (this->data_compressed[1])) >> 6); + this->data_decompressed[1] = (unsigned char)(this->data_decompressed[1] & ~(128 + 64 + 32)); + + // 00000111 11222223 33334444 45555566 66677777 + // 00000111 + // ---00000 + this->data_decompressed[0] = (unsigned char)(this->data_compressed[0] >> 3); // shifts of unsigned types always zero-fill :) + + + // decode + for(unsigned char i = 0; i < 8; i++) { + this->data_decompressed[i] = fivebit_fivebytes::fivebit_alphabet[this->data_decompressed[i]]; + } +} + + + +// static functions - not bound to class instance +unsigned char fivebit_fivebytes::iterator_to_offset(unsigned int iterator) +{ + return (unsigned char)(iterator % 8); +} + + +/* + it is not necessary to write the whole 5 byes if only 3 contain compressin information + 00000111 11222223 33334444 45555566 66677777 + // 00000111 11222223 33334444 45555566 66677777 + // 1 2 3 4 5 6 7 +*/ +unsigned char fivebit_fivebytes::decompressed_to_compressed_bytes(unsigned char decompressed_bytes) +{ + return (unsigned char)(((decompressed_bytes + 3) * 5 / 8) - 1); +} + + + + + + +/* + * To calculate file offset + * + * example: + * + * >Seq + * [ABCDEFGH][ABCDEFGH] has offset of 2 + * + * >Seq + * [ABCDEFGH][ABCDEFGH][A] has offset of 2? + * + * >Seq + * [ABCDEFGH][ABCDEFGH][ACCCAAC] has offset of 2? + * */ +const off_t fivebit_fivebytes::nucleotides_to_compressed_fileoffset(size_t n_amino_acids) +{ + off_t out = n_amino_acids / (off_t) fivebit_fivebytes::nucleotides_per_chunk; + + out = out * fivebit_fivebytes::bytes_per_chunk; + + return out; +} + +/* + * To calculate file offset + * + * example: + * + * >Seq + * [ABCDEFGH][ABCDEFGH] has offset of 10 + * + * >Seq + * [ABCDEFGH][ABCDEFGH][A] has offset of 11? 
+ * + */ +const off_t fivebit_fivebytes::nucleotides_to_compressed_offset(size_t n_amino_acids) +{ + return fivebit_fivebytes::nucleotides_to_compressed_fileoffset(n_amino_acids) + + fivebit_fivebytes::decompressed_to_compressed_bytes(n_amino_acids % fivebit_fivebytes::nucleotides_per_chunk); +} + + + + + + + +void fivebit_fivebytes::next(chunked_reader &r) +{ + r.read(this->data_compressed, fivebit_fivebytes::bytes_per_chunk); + this->unpack(); +} diff --git a/src/flags.cpp b/src/flags.cpp index e1ec3911..8ddeb0cf 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -105,6 +105,14 @@ bool fastafs_sequence_flags::is_iupec_nucleotide() this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); } +bool fastafs_sequence_flags::is_protein() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == true && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); +} + + bool fastafs_sequence_flags::is_complete() { return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE); @@ -138,6 +146,13 @@ void fastafs_sequence_flags::set_iupec_nucleotide() this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); } +void fastafs_sequence_flags::set_protein() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, true); // 1,1 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); +} + + void fastafs_sequence_flags::set_complete() { this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, true); diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index 399d790d..1f0a3f49 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -21,7 +21,7 @@ binary: IUPEC */ const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; -const char fourbit_byte::encode_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", 
"GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", "KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; +char fourbit_byte::encode_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", "GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", 
"KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; /* @@ -242,7 +242,38 @@ char *fourbit_byte::get(unsigned char length) -const char *fourbit_byte::get() +char *fourbit_byte::get() { return fourbit_byte::encode_hash[this->data]; } + + + +/* + * To calculate file offset to set file pointer to + * + * example: + * + * >Seq + * [AC][PG] offset is 2? 
+ * + * >Seq + * [AC[PG][M] offset is 2 + * + * dit is naar beneden afgerond zodat de file pointer ervoor start + * + * */ +const off_t fourbit_byte::nucleotides_to_compressed_fileoffset(size_t n_nucleotides) +{ + return (off_t) n_nucleotides / fourbit_byte::nucleotides_per_byte; +} + + + + +void fourbit_byte::next(chunked_reader &r) +{ + this->data = r.read(); +} + + diff --git a/src/fuse.cpp b/src/fuse.cpp index 3021be8d..7db2a261 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -15,13 +15,18 @@ #include #include //#include - +#include +#include +#include +#include +#include #include "fuse.hpp" #include "database.hpp" #include "fastafs.hpp" #include "ucsc2bit.hpp" #include "sequence_region.hpp" +#include "chunked_reader.hpp" // http://www.maastaar.net/fuse/linux/filesystem/c/2016/05/21/writing-a-simple-filesystem-using-fuse/ @@ -29,6 +34,20 @@ +struct file_thread_info { + chunked_reader *cr; + sem_t sem; +}; + +int MAX_FILE_THREADS = 4; + +struct file_threads { + std::vector crs; + int thread_i = 0; // thread iterator +}; + + + struct fuse_instance { //fastasfs fastafs *f; @@ -36,7 +55,6 @@ struct fuse_instance { ffs2f_init *cache_p0;// cache with padding of 0; used by API '/seq/chr1:123:456' bool from_fastafs; // if false, from 2bit - // ucsc2bit ucsc2bit *u2b; @@ -45,7 +63,7 @@ struct fuse_instance { uint32_t padding; bool allow_masking; int argc_fuse; - + timespec ts[2]; // access and modify time }; @@ -57,7 +75,7 @@ static int do_getattr(const char *path, struct stat *st) char cur_time[100]; time_t now = time(0); - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); // GNU's definitions of the attributes (http://www.gnu.org/software/libc/manual/html_node/Attribute-Meanings.html): // st_uid: The user ID of the file’s owner. 
@@ -76,7 +94,6 @@ static int do_getattr(const char *path, struct stat *st) st->st_nlink = 1; - printf("[%s]\n", path); if(strcmp(path, "/") == 0) { //st->st_mode = S_IFREG | 0444; //st->st_nlink = 1; @@ -91,7 +108,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_nlink = 1; } else if(strlen(path) > 4 && strncmp(path, "/seq/", 5) == 0) { // API: "/seq/chr1:123-456" - printf("setting to FILE [%s] because /seq/...\n", path); + //printf("setting to FILE [%s] because /seq/...\n", path); // @ todo - run a check on wether the chr exists and return err otherwise st->st_mode = S_IFREG | 0444; st->st_nlink = 1; @@ -104,7 +121,9 @@ static int do_getattr(const char *path, struct stat *st) if(ffi->from_fastafs) { if(ffi->f != nullptr) { +#if DEBUG printf("\033[0;32m[%s]\033[0;33m do_getattr:\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, path, ffi->f->name.c_str(), ffi->padding); +#endif std::string virtual_fasta_filename = "/" + ffi->f->name + ".fa"; std::string virtual_faidx_filename = "/" + ffi->f->name + ".fa.fai"; @@ -126,7 +145,9 @@ static int do_getattr(const char *path, struct stat *st) } } else { if(ffi->u2b != nullptr) { +#if DEBUG printf("\033[0;32m[%s]\033[0;33m do_getattr:\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, path, ffi->u2b->name.c_str(), ffi->padding); +#endif std::string virtual_fasta_filename = "/" + ffi->u2b->name + ".fa"; std::string virtual_faidx_filename = "/" + ffi->u2b->name + ".fa.fai"; @@ -151,13 +172,13 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of char cur_time[100]; time_t now = time(0); - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); filler(buffer, ".", NULL, 0); // Current Directory filler(buffer, "..", NULL, 0); // Parent Directory if(ffi->from_fastafs) { - printf("\033[0;32m[%s]\033[0;33m fastafs::do_readdir(\033[0moffset=%u\033[0;33m):\033[0m %s 
\033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); + printf("\033[0;32m[%s]\033[0;33m do_readdir(\033[0moffset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); std::string virtual_fasta_filename = ffi->f->name + ".fa"; std::string virtual_faidx_filename = ffi->f->name + ".fa.fai"; @@ -172,7 +193,9 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } } else { if(ffi->u2b != nullptr) { +#if DEBUG printf("\033[0;32m[%s]\033[0;33m 2bit::do_readdir(\033[0moffset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) offset, path, ffi->u2b->name.c_str(), ffi->padding); +#endif std::string virtual_fasta_filename = ffi->u2b->name + ".fa"; std::string virtual_faidx_filename = ffi->u2b->name + ".fa.fai"; @@ -192,8 +215,88 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } +static int do_open(const char *path, struct fuse_file_info *fi) +{ + fuse_instance *ffi = static_cast(fuse_get_context()->private_data); + + char cur_time[100]; + time_t now = time(0); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); + + printf("\033[0;32m[%s]\033[0;33m do_open\n", cur_time); + //(\033[0ms=%u, off=%u\033[0;33m):\033[0m %s \033[0;35m(%s, pad: %u)\033[0m\n", + //cur_time, (uint32_t) size, (uint32_t) offset, path, ); + + //chunked_reader *cr = new chunked_reader(ffi->f->filename.c_str()); + + printf("test... \n"); + + // has list with 32 chunked_reader objects + file_threads *ft = new file_threads(); + for(ft->thread_i = 0; ft->thread_i < MAX_FILE_THREADS; ft->thread_i++) { + ft->crs.push_back( + file_thread_info{ + new chunked_reader(ffi->f->filename.c_str()) + } + ); + + //printf("sem init... \n"); + sem_init( &(ft->crs[ft->thread_i].sem), 0, 1 ); + //printf("sem init done... 
\n"); + } + ft->thread_i = 0; + + fi->fh = reinterpret_cast(ft); + printf("\033[0;35m fi->fh: %u\n", (unsigned int) fi->fh); + + printf("\033[0;35m fi->fh: %u\n", (unsigned int) fi->fh); + printf("\033[0;35m fi->writepage: %u\n", fi->writepage); + printf("\033[0;35m fi->direct_io: %u\n", fi->direct_io); + printf("\033[0;35m fi->keep_cache: %u\n", fi->keep_cache); + printf("\033[0;35m fi->padding: %u\n", fi->padding); + + // here the fi->fh should be set?! + // if possible to chunked reader? + //chunked_reader *cr = new chunked_reader("/tmp/wget"); + //fi->fh = fh_i++; + // should be set to a real fh i presume? + + return 0; +} + +static int do_flush(const char *path, struct fuse_file_info *fi) +{ + return 0; +} + +static int do_release(const char *path, struct fuse_file_info *fi) +{ + file_threads *ft = (file_threads*) fi->fh; + //printf("do_release() - filehandle had [ %i ] locks \n", ft->thread_locks); + + if(ft != nullptr) { + for(size_t i = 0; i < ft->crs.size(); i++) { + sem_destroy(&ft->crs[i].sem); + delete ft->crs[i].cr; + + } + delete ft; + } + + return 0; +} + + +// threaded implementation of libfuse? 
+// https://libfuse.github.io/doxygen/poll_8c.html + +// test file error reads at: do_read(s=4096, off=20480): static int do_read(const char *path, char *buffer, size_t size, off_t offset, struct fuse_file_info *fi) { + file_threads *ft = (file_threads*) fi->fh; + int cur_file_thread = ft->thread_i++ % MAX_FILE_THREADS; + sem_wait(&ft->crs[cur_file_thread].sem); + fuse_instance *ffi = static_cast(fuse_get_context()->private_data); static int written = -2;// -1 = permission deinied, -2 = missing file or directory @@ -202,9 +305,16 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st #if DEBUG char cur_time[100]; time_t now = time(0); - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); - - printf("\033[0;32m[%s]\033[0;33m fastafs::do_read(\033[0msize=%u, offset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) size, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); + + printf("\033[0;32m[%s]\033[0;33m do_read(\033[0ms=%u, off=%u\033[0;33m):\033[0m %s \033[0;35m(%s, pad: %u)\033[0m\n", cur_time, (uint32_t) size, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); + //printf("\033[0;35m fi: 0x%p\n", (uintptr_t) fi); + printf("\033[0;35m fi: 0x%p\n", (void*) fi); + printf("\033[0;35m fi->fh: %u\n", (unsigned int) fi->fh); + printf("\033[0;35m fi->writepage: %u\n", fi->writepage); + printf("\033[0;35m fi->direct_io: %u\n", fi->direct_io); + printf("\033[0;35m fi->keep_cache: %u\n", fi->keep_cache); + printf("\033[0;35m fi->padding: %u\n", fi->padding); #endif std::string virtual_fasta_filename = "/" + ffi->f->name + ".fa"; @@ -212,9 +322,8 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st std::string virtual_ucsc2bit_filename = "/" + ffi->f->name + ".2bit"; std::string virtual_dict_filename = "/" + ffi->f->name + ".dict"; - //printf("?? 
[[%s]]\n", path); if(strcmp(path, virtual_fasta_filename.c_str()) == 0) { - written = (signed int) ffi->f->view_fasta_chunk(ffi->cache, buffer, size, offset); + written = (signed int) ffi->f->view_fasta_chunk(ffi->cache, buffer, size, offset, *ft->crs[cur_file_thread].cr); } else if(strcmp(path, virtual_faidx_filename.c_str()) == 0) { written = (signed int) ffi->f->view_faidx_chunk(ffi->padding, buffer, size, offset); } else if(strcmp(path, virtual_ucsc2bit_filename.c_str()) == 0) { @@ -229,7 +338,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st #if DEBUG char cur_time[100]; time_t now = time(0); - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); printf("\033[0;32m[%s]\033[0;33m 2bit::do_read(\033[0msize=%u, offset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) size, (uint32_t) offset, path, ffi->u2b->name.c_str(), ffi->padding); #endif @@ -245,7 +354,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } } - //printf(" return written=%u\n", written); + sem_post(&ft->crs[cur_file_thread].sem); return written; } @@ -290,7 +399,7 @@ static int do_getxattr(const char* path, const char* name, char* value, size_t s // decoy function to not throw an error if snakemake access this -// as it doesn't have access to fi it is practically not possible to do a generic update +// as it doesn't have access to fi it is practically not possible to do a generic update static int do_utimens(const char *path, const struct timespec ts[2]) // seems it doesn't understand 'fuse_file_info ?' 
, struct fuse_file_info *fi) { //(void) fi; @@ -300,7 +409,7 @@ static int do_utimens(const char *path, const struct timespec ts[2]) // seems it //res = utimensat(0, path, ts, AT_SYMLINK_NOFOLLOW); // set fi data to ts //if (res == -1) - //return -errno; + //return -errno; return 0; } @@ -320,6 +429,12 @@ void do_destroy(void *pd) delete ffi->u2b; } + // for + //if(ffi->cr != nullptr) { + //delete ffi->cr; + //} + + delete ffi; /* @@ -354,12 +469,12 @@ fuse_operations operations = { nullptr, // int (*chown) (const char *, uid_t, gid_t); nullptr, // int (*truncate) (const char *, off_t); nullptr, // int (*utime) (const char *, struct utimbuf *); - nullptr, // int (*open) (const char *, struct fuse_file_info *); + do_open, // int (*open) (const char *, struct fuse_file_info *); do_read, // int (*read) (const char *, char *, size_t, off_t, struct fuse_file_info *); nullptr, // int (*write) (const char *, const char *, size_t, off_t, struct fuse_file_info *); nullptr, // int (*statfs) (const char *, struct statvfs *); - nullptr, // int (*flush) (const char *, struct fuse_file_info *); - nullptr, // int (*release) (const char *, struct fuse_file_info *); + do_flush, // int (*flush) (const char *, struct fuse_file_info *); + do_release, // int (*release) (const char *, struct fuse_file_info *); nullptr, // int (*fsync) (const char *, int, struct fuse_file_info *); nullptr, // int (*setxattr) (const char *, const char *, const char *, size_t, int); do_getxattr,// int (*getxattr) (const char *, const char *, char *, size_t); @@ -478,6 +593,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) nullptr,// pointer to fastafs_init with defined padding nullptr, // pointer to fastafs_init with cache size of 0 (for mounting ./seq/chr1:123-456 true, // from fastafs + nullptr, // pointer to ucsc2bit decoder - if from_fasta is set to false 60, // default_padding true, // allow_masking @@ -487,8 +603,6 @@ fuse_instance *parse_args(int argc, char **argv, char 
**argv_fuse) //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs - printf("checkpoint a\n"); - std::vector fuse_options = {}; // those that need to be appended later char current_argument = '\0';// could be o for '-o', etc. @@ -497,7 +611,9 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) std::vector full_args = {}; for(signed int i = 0; i < argc; ++i) { +#if DEBUG printf("processing argv[%i] = '%s' [current argument=%i]\n", i, argv[i], (int) current_argument); +#endif if(current_argument != '\0') { // parse the arguments' value switch(current_argument) { @@ -552,14 +668,10 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - printf("checkpoint b\n"); - if(full_args.size() > 2) { - printf("checkpoint c\n"); printf("full_args.size() = %u\n", (uint32_t) full_args.size()); int mount_target_arg = full_args[full_args.size() - 2 ]; // last two arguments are and , location to last 2 args not starting with --/- are in this vector - printf("out of bound???\n"); if(fi->from_fastafs) { database d = database(); @@ -598,35 +710,30 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - printf("checkpoint c\n"); - return fi; } - void fuse(int argc, char *argv[]) { - printf("wake up\n"); + // part 1 - rewrite args because "fastafs" "mount" is considered as two args, crashing fuse_init // - @todo at some point define that second mount is not really important? 
if possible char *argv2[argc]; fuse_instance *ffi = parse_args(argc, argv, argv2); - printf("checkpoint\n"); - // part 2 - print what the planning is char cur_time[100]; time_t now = time(0); - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); printf("\033[0;32m[%s]\033[0;33m init (recv arguments):\033[0m [argc=%i]", cur_time, argc); for(int i = 0; i < argc; i++) { printf(" argv[%u]=\"%s\"", i, argv[i]); } - strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); + strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S", localtime(&now)); printf("\n\033[0;32m[%s]\033[0;33m init (fuse arguments):\033[0m [argc=%i]", cur_time, ffi->argc_fuse); for(int i = 0; i < ffi->argc_fuse; i++) { printf(" argv[%u]=\"%s\"", i, argv2[i]); @@ -641,5 +748,7 @@ void fuse(int argc, char *argv[]) fuse_main(ffi->argc_fuse, argv2, &operations, ffi); } //http://www.maastaar.net/fuse/linux/filesystem/c/2016/05/21/writing-a-simple-filesystem-using-fuse/ + + //return ret; } diff --git a/src/main.cpp b/src/main.cpp index 8f2d92f4..2b5228e6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,6 +12,7 @@ #include "fuse.hpp" #include "lsfastafs.hpp" +#include "zstd_seekable_utils.hpp" // https://github.com/facebook/zstd/issues/521 // https://github.com/samtools/samtools/blob/develop/faidx.c @@ -24,7 +25,7 @@ void usage() std::cout << " list overview of FASTAFS database" << std::endl; std::cout << " ps overview of mounted FASTAFS instances and mountpoints" << std::endl; std::cout << std::endl; - std::cout << " [single fastafs operations]" << std::endl; + std::cout << " [operations per fastafs file]" << std::endl; std::cout << " cache adds FASTA file to cache" << std::endl; std::cout << " view view FASTAFS as FASTA file" << std::endl; std::cout << " info view FASTAFS information" << std::endl; @@ -73,6 +74,7 @@ void usage_cache(void) std::cout << " cache -o \n\n"; std::cout << " -o, --output-file Explicitly define 
fastafs output file and do not write to database (cache)\n"; std::cout << " -2, --2bit Force 2bit when files become larger than 4bit due to huge N-blocks\n"; + std::cout << " -f, --fastafs-only Convert to FASTAFS only; skip ZSTD-seekable\n"; } int main(int argc, char *argv[]) @@ -98,32 +100,24 @@ int main(int argc, char *argv[]) if(argc > 3) { bool to_cache = true; bool auto_recompress_to_fourbit = true; + bool compress_to_zstd_seekable = true; for(int i = 0 ; i < argc ; i++) { - if( - (strcmp(argv[i], "-2") == 0) - or - (strcmp(argv[i], "--2bit") == 0) - ) { + if((strcmp(argv[i], "-2") == 0) or (strcmp(argv[i], "--2bit") == 0)) { auto_recompress_to_fourbit = false; } - - - if( i < argc - 1 and - ( - (strcmp(argv[argc - 3], "-o") == 0) - or - (strcmp(argv[argc - 3], "--output-file") == 0) - ) - - ) { + + if((strcmp(argv[i], "-f") == 0) or (strcmp(argv[i], "--fastafs-only") == 0)) { + compress_to_zstd_seekable = false; + } + + if(i < argc - 1 and ((strcmp(argv[argc - 3], "-o") == 0) or (strcmp(argv[argc - 3], "--output-file") == 0))) { to_cache = false; } } - - + // reserve place in database std::string fname_out; if(to_cache) { database d = database(); @@ -132,6 +126,8 @@ int main(int argc, char *argv[]) fname_out = std::string(argv[argc - 2]); } + + // convert to plain fastafs if(is_fasta_file(argv[argc - 1])) { // converter is now generic for 2 and 4 bit fasta_to_fastafs(argv[argc - 1], fname_out, auto_recompress_to_fourbit); @@ -141,6 +137,20 @@ int main(int argc, char *argv[]) throw std::runtime_error("[main::cache] Invalid file format"); return 1; } + + + // convert to zstd seekable + if(compress_to_zstd_seekable) { + std::string fname_out_zstd = fname_out + ".zst"; + size_t zst_written = ZSTD_seekable_compressFile_orDie((const char*) fname_out.c_str(), + (const char*) fname_out_zstd.c_str(), + (int) ZSTD_COMPRESSION_QUALIITY, + (unsigned int) ZSTD_SEEKABLE_FRAME_SIZE); + + if(zst_written > 0) { + remove(fname_out.c_str()); + } + } } else { usage_cache(); 
exit(0); @@ -268,6 +278,7 @@ int main(int argc, char *argv[]) d.list(); } else if(strcmp(argv[1], "ps") == 0) { std::unordered_multimap > fastafs_fuse_mounts = get_fastafs_processes(); + for(auto n : fastafs_fuse_mounts) { std::cout << n.second.first << "\t" << n.first << "\t" << n.second.second << "\n"; } @@ -277,15 +288,14 @@ int main(int argc, char *argv[]) usage_check(); exit(0); } - + bool from_file = false; bool check_md5 = false; for(int i = 2; i < argc - 1; i++) { if(strcmp(argv[i], "-f") == 0 or strcmp(argv[i], "--file") == 0) { from_file = true; - } - else if (strcmp(argv[i], "-5") == 0 or strcmp(argv[i], "--md5") == 0) { + } else if(strcmp(argv[i], "-5") == 0 or strcmp(argv[i], "--md5") == 0) { check_md5 = true; } } @@ -308,11 +318,11 @@ int main(int argc, char *argv[]) bool check1 = f.check_file_integrity(true); bool check2 = true; - + if(check_md5) { check2 = f.check_sequence_integrity(true); } - + if(check1 and check2) { return 0; } else { diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index fd80c602..cfd92d3d 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -128,13 +128,41 @@ char *twobit_byte::get(unsigned char length) -const char *twobit_byte::get() +char *twobit_byte::get() { return twobit_byte::encode_hash[this->data]; } +/* + * To calculate file offset + * + * example: + * + * >Seq + * [ACTG][ACTG] has offset of 2 (or 3)? + * + * >Seq + * [ACTG][ACTG][AC] has offset of 2 (or 3)? 
+ * */ +const off_t twobit_byte::nucleotides_to_compressed_fileoffset(size_t n_nucleotides) +{ + return (off_t) n_nucleotides / twobit_byte::nucleotides_per_byte; +} + +const off_t twobit_byte::nucleotides_to_compressed_offset(size_t n_nucleotides) +{ + return twobit_byte::nucleotides_to_compressed_fileoffset(n_nucleotides + twobit_byte::nucleotides_per_byte - 1 + ); +} + + +// needs to be a separate function because not all encodings read byte-per-byte +void twobit_byte::next(chunked_reader &r) +{ + this->data = r.read(); +} diff --git a/src/utils.cpp b/src/utils.cpp index 701e7bda..4f4446ba 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -199,7 +199,7 @@ bool is_ucsc2bit_file(char *filename) if((fp = fopen(filename, "rb")) == NULL) { //fclose(fp); segfault if NULL - throw std::runtime_error("Could not read first byte of putative FASTA file."); + throw std::runtime_error("Could not read first byte of file."); return false; } @@ -221,6 +221,38 @@ bool is_ucsc2bit_file(char *filename) } + +bool is_zstd_file(const char *filename) +{ + char buf[4 + 1]; + FILE *fp; + + if((fp = fopen(filename, "rb")) == NULL) { + //fclose(fp); segfault if NULL + throw std::runtime_error("Could not read first byte of file."); + return false; + } + + if(fread(buf, 1, 4, fp) == 4) { + fclose(fp); + + return ( + buf[0] == ZSTD_MAGIC[0] and + buf[1] == ZSTD_MAGIC[1] and + buf[2] == ZSTD_MAGIC[2] and + buf[3] == ZSTD_MAGIC[3] + );// return true if the first four bytes equal ZSTD_MAGIC + } else { + fclose(fp); + + throw std::runtime_error("Could not read sufficient data."); + } + + return false; +} + + + // https://www.systutorials.com/241216/how-to-get-the-directory-path-and-file-name-from-a-absolute-path-in-c-on-linux/ // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::path(filename).filename(); std::string basename_cpp(std::string fn) @@ -290,3 +322,8 @@ uint32_t file_crc32(const std::string &fname, off_t start, size_t len) } +bool file_exist(const 
char *fileName) +{ + std::ifstream infile(fileName); + return infile.good(); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 63552df7..32dbc9c6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,10 @@ cmake_minimum_required(VERSION 2.8) -include_directories(../include) +include_directories("${CMAKE_SOURCE_DIR}/include") +include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted") +include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd/lib/common") +include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd/lib") + find_package(Boost COMPONENTS system filesystem unit_test_framework REQUIRED) if(NOT Boost_FOUND) @@ -18,18 +22,199 @@ set(BUILD_DIR "../bin") set(BUILD_TEST_DIR "${BUILD_DIR}/test") -add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_cache cache/test_cache.cpp ../src/fasta_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_check check/test_check.cpp ../src/fasta_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_fastafs.cpp 
../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) -add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/sequence_region.cpp) -add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) +add_executable(test_twobit_byte "twobit_byte/test_twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_fourbit_byte "fourbit_byte/test_fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_fivebit_fivebytes "fivebit_fivebytes/test_fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + 
"${CMAKE_SOURCE_DIR}/src/utils.cpp" + + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_cache "cache/test_cache.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_view "view/test_view.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_flags "flags/test_flags.cpp" + 
"${CMAKE_SOURCE_DIR}/src/flags.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp") + + +add_executable(test_fastafs "fastafs/test_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_check "check/test_check.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_fastafs_as_ucsc2bit "fastafs/test_ucsc2bit.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + 
"${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_ucsc2bit_to_fastafs "ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/ucsc2bit_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_ucsc2bit_as_fasta "ucsc2bit/test_ucsc2bit_as_fasta.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/ucsc2bit.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + 
"${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp") + + +add_executable(test_sequenceregion "sequenceregion/test_sequenceregion.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp") + + +add_executable(test_utils "utils/test_utils.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp") + + +add_executable(test_chunked_reader "chunked_reader/test_chunked_reader.cpp" + "${CMAKE_SOURCE_DIR}/src/fasta_to_fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/flags.cpp" + + "${CMAKE_SOURCE_DIR}/src/fastafs.cpp" + "${CMAKE_SOURCE_DIR}/src/twobit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fourbit_byte.cpp" + "${CMAKE_SOURCE_DIR}/src/fivebit_fivebytes.cpp" + "${CMAKE_SOURCE_DIR}/src/utils.cpp" + "${CMAKE_SOURCE_DIR}/src/sequence_region.cpp" + + "${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common/xxhash.c" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_utils.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_compress.cpp" + "${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp" + + "${CMAKE_SOURCE_DIR}/src/chunked_reader.cpp" ) #add_executable(test_tree tree/test_tree.cpp) @@ -51,6 +236,8 @@ set_target_properties(test_twobit_byte PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fourbit_byte PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_fivebit_fivebytes + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_to_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_as_fasta @@ -59,6 +246,8 @@ set_target_properties(test_sequenceregion PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_utils PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_chunked_reader + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") 
#set_target_properties(test_tree # PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") diff --git a/test/cache/test_cache.cpp b/test/cache/test_cache.cpp index 8eaf3115..1d5b5086 100644 --- a/test/cache/test_cache.cpp +++ b/test/cache/test_cache.cpp @@ -842,8 +842,6 @@ BOOST_AUTO_TEST_CASE(test_cache_2) BOOST_AUTO_TEST_CASE(test_cache_test_high_N_freq) { - std::cout << " --------------------------------------------- \n\n"; - size_t written = fasta_to_fastafs("test/data/test_009.fa", "tmp/test_009.fastafs", true); static std::string reference = @@ -863,9 +861,9 @@ BOOST_AUTO_TEST_CASE(test_cache_test_high_N_freq) "\x00\x00\x00\x00"s// [, ] m-blocks (2) "\x00\x00\x00\x47"s// [14, 17] seq length (71) - "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F"s// - "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F"s// - "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\xF0"s// + "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F"s// + "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F"s// + "\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\x1F\x11\xF1\xF0"s// "\x00\x00\x00\x00"s// [, ] n-blocks (0) "\xA7\xBA\xB4\x6A\x83\xB0\xE3\x29\x3F\x26\xE9\xD7\x0D\x97\x01\x3C"s// checksum "\x00\x00\x00\x00"s// [, ] m-blocks (0) @@ -887,7 +885,7 @@ BOOST_AUTO_TEST_CASE(test_cache_test_high_N_freq) "\x0E\xB8\xC0\x8A"s ; - //BOOST_CHECK_EQUAL(written, 125); + BOOST_CHECK_EQUAL(written, reference.size()); //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); std::ifstream file("tmp/test_009.fastafs", std::ios::in | std::ios::binary | std::ios::ate); @@ -921,4 +919,144 @@ BOOST_AUTO_TEST_CASE(test_cache_test_high_N_freq) + + + +BOOST_AUTO_TEST_CASE(test_cache_protein) +{ + size_t written = fasta_to_fastafs("test/data/test_010.fa", "tmp/test_010.fastafs", true); + + static std::string reference = + // GENERIC-HEADER - size: 14 + "\x0F\x0A\x46\x53"s// [0, 3] + "\x00\x00\x00\x00"s// [4, 7] version + "\x80\x00"s// [8, 9] FASTAFS flag [ 10000000 | 00000000 ] + "\x00\x00\x00\x47"s // [10, 
13] index position in file + + // DATA - size: 43 + "\x00\x00\x00\x2D"s// [14, 17] 45 x ACTG's + "\x01\x03\xAD\x68\xA0"s // [18, 22] + "\x94\xC0\x59\x6B\x5A"s // [23, 27] + "\x16\x04\x84\x64\x8B"s // [28, 32] + "\x0B\x60\xF1\x32\x65"s // [33, 37] + "\xCB\x67\x93\x5A\x02"s // [38, 42] + "\x4A\x77\x73\x00"s // [43, 46] last bytes contains no info thus must be skipped + + "\x00\x00\x00\x00"s// [47, 50] n-blocks (0) + "\xA1\x97\x13\xD9\xB6\xE9\xDD\x9F\x19\xC1\x79\x12\x97\xDF\x41\x3C"s// [51, 66] checksum + "\x00\x00\x00\x00"s// [67, 70] m-blocks (2) + + // INDEX + "\x00\x00\x00\x01"s // [71, 74] n sequences + + "\xD0\x00" // [343, 344] complete, DNA and not circular + "\x07"s "PROTEIN"s // [345, 349] name + "\x00\x00\x00\x0E"s // data position in file (14) + + // METADATA + "\x00"s // [399] no metadata fields [padding will come soon?] + + // CRC32 checksums + "\x77\xAE\x11\x2D"s // only part that is not yet checked + ; + + BOOST_CHECK_EQUAL(written, 94); // 220 bytes compressed data with 44 5/bit/5/bytes + + std::ifstream file("tmp/test_010.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + BOOST_REQUIRE(file.is_open()); + + std::streampos size; + char * buffer; + size = file.tellg(); + buffer = new char [size]; + + file.seekg(0, std::ios::beg); + file.read(buffer, size); + BOOST_CHECK_EQUAL(file.gcount(), size); + file.close(); + + //BOOST_CHECK_UNEQUAL(ret, -1); + + + for(unsigned int i = 0; i < size; i++) { + BOOST_CHECK_EQUAL(buffer[i], reference[i]); + + //if(reference[i] != buffer[i]) { + // printf("comparing char %u ** mismatch [ref] %d %02hhX != [buf] (%u x %02hhX)\n", i, reference[i], reference[i], buffer[i], (unsigned char) buffer[i], buffer[i]); + //} + } + + delete[] buffer; + +} + + + +BOOST_AUTO_TEST_CASE(test_cache_protein2) +{ + size_t written = fasta_to_fastafs("test/data/test_011.fa", "tmp/test_011.fastafs", true); + + static std::string reference = + // GENERIC-HEADER - size: 14 + "\x0F\x0A\x46\x53"s// [0, 3] + "\x00\x00\x00\x00"s// [4, 7] 
version + "\x80\x00"s// [8, 9] FASTAFS flag [ 10000000 | 00000000 ] + "\x00\x00\x00\x38"s // [10, 13] index position in file + + // DATA - size: 43 + "\x00\x00\x00\x15"s// [14, 17] 21 x ACTG's + "\x60\x0B\x20\x10\x75" // [18, 22] + "\x5A\x89\x71\xC6\x31" // [23, 27] + "\x8B\x08\x05\x80" // [28, 31] + "\x00\x00\x00\x00"s// [32, 35] n-blocks (0) + "\x83\x1a\x10\x3b\xf8\x03\x3e\x69\x54\xba\xe3\x86\x98\x9f\x60\xf3"s// [36, 51] checksum + "\x00\x00\x00\x00"s// [52, 55] m-blocks (2) + + // INDEX + "\x00\x00\x00\x01"s // [56, 59] n sequences + + "\xD0\x00" // [60, 61] complete, DNA and not circular + "\x1C"s "twobit-fourbit-fivebit-error"s // [62, 90] name + "\x00\x00\x00\x0E"s // [91, 94] + + // METADATA + "\x00"s // [95] + + // CRC32 checksums + "\x67\x1B\xC6\xB5"s // [96, 99] + ; + + BOOST_CHECK_EQUAL(written, 100); // 220 bytes compressed data with 44 5/bit/5/bytes + + std::ifstream file("tmp/test_011.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + BOOST_REQUIRE(file.is_open()); + + std::streampos size; + char * buffer; + size = file.tellg(); + buffer = new char [size]; + + file.seekg(0, std::ios::beg); + file.read(buffer, size); + BOOST_CHECK_EQUAL(file.gcount(), size); + file.close(); + + //BOOST_CHECK_UNEQUAL(ret, -1); + + + for(unsigned int i = 0; i < size; i++) { + BOOST_CHECK_EQUAL(buffer[i], reference[i]); + + if(reference[i] != buffer[i]) { + printf("comparing char %u ** mismatch [ref] %d %02hhX != [buf] (%u x %02hhX)\n", i, reference[i], reference[i], buffer[i], (unsigned char) buffer[i], buffer[i]); + } + + } + + delete[] buffer; + +} + + + BOOST_AUTO_TEST_SUITE_END() diff --git a/test/chunked_reader/test_chunked_reader.cpp b/test/chunked_reader/test_chunked_reader.cpp new file mode 100644 index 00000000..6a0a7e97 --- /dev/null +++ b/test/chunked_reader/test_chunked_reader.cpp @@ -0,0 +1,306 @@ + +#define BOOST_TEST_MODULE fastfs_test_chunked_reader + + +#include +#include + +#include + +#include "config.hpp" + +#include "fasta_to_fastafs.hpp" 
+#include "fastafs.hpp" +#include "chunked_reader.hpp" + +#include "zstd_seekable_utils.hpp" + + + +void flush_buffer(char *buffer, size_t n, char fill) +{ + for(size_t i = 0; i < n; i++) { + buffer[i] = fill; + } +} + + + +BOOST_AUTO_TEST_SUITE(Testing) + + + +BOOST_AUTO_TEST_CASE(test_chunked_reading_small_file) +{ + std::string test_name = "test"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + std::string fastafs_file_zstd = "tmp/" + test_name + ".fastafs.zst"; + + fasta_to_fastafs(fasta_file, fastafs_file, false); + ZSTD_seekable_compressFile_orDie((const char*) fastafs_file.c_str(), + (const char*) fastafs_file_zstd.c_str(), + (int) ZSTD_COMPRESSION_QUALIITY, + (unsigned) ZSTD_SEEKABLE_FRAME_SIZE); + + + char buffer[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + std::string std_buffer; + buffer[1024] = '\0'; + size_t written; + + std::string reference1 = "\x0f\x0a\x46\x53\x00\x00\x00\x00\x80\x00\x00\x00\x01\x37\x00\x00\x00\x10\x00\x55\xaa\xff\x00\x00\x00\x00\x75\x25\x5c\x6d\x90\x77\x89\x99\xad\x36\x43\xa2\xe6\x9d\x43\x44\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00\x0c\x93\x93\x93\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x0b\x8b\x56\x73\x72\x4a\x99\x65\xc2\x9a\x1d\x76\xfe\x70\x31\xac\x8a\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x0b\x00\x00\x00\x0d\x93\x93\xaa\x40\x00\x00\x00\x00\x61\xde\xba\x32\xec\x4c\x35\x76\xe3\x99\x8f\xa2\xd4\xb8\x72\x88\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x0e\x93\x93\xaa\x50\x00\x00\x00\x00\x99\xb9\x05\x60\xf2\x3c\x1b\xda\x28\x71\xa6\xc9\x3f\xd6\xa2\x40\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x0d\x00\x00\x00\x0f\x93\x93\xaa\x54\x00\x00\x00\x00\x36\x25\xaf\xdf\xbe\xb4\x37\x65\xb8\x5f\x61\x2e\x0a\xcb\x47\x39\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x0e\x00\x00\x00\x04\x93\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x07\xbd\x8c\x08\x0e\xd2\x5b\xa8\xa4\x54\xd9\x43\x4c\xb8\xd1\x4a\x68\x0
0\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00\x04\x93\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x98\x0e\xf3\xa1\xcd\x80\xaf\xec\x95\x9d\xcf\x85\x2d\x02\x62\x46\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x07\x10\x00\x04\x63\x68\x72\x31\x00\x00\x00\x0e\x10\x00\x04\x63\x68\x72\x32\x00\x00\x00\x36\x10\x00\x06\x63\x68\x72\x33\x2e\x31\x00\x00\x00\x65\x10\x00\x06\x63\x68\x72\x33\x2e\x32\x00\x00\x00\x8d\x10\x00\x06\x63\x68\x72\x33\x2e\x33\x00\x00\x00\xb5\x10\x00\x04\x63\x68\x72\x34\x00\x00\x00\xdd\x10\x00\x04\x63\x68\x72\x35\x00\x00\x01\x0a\x00\x1e\x77\x77\x22"s;// xxd -p + std::string reference2 = "\x0f\x0a\x46\x53"s; + std::string reference3 = "\x0a\x46\x53\x00"s; + + { + chunked_reader r_flat = chunked_reader(fastafs_file.c_str()); + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 403); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference1), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + // test what happens when file is closed + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + // test seek stuff + r_flat.seek(0); // reset to first pos in file + BOOST_CHECK_EQUAL(r_flat.tell(), 0); + + written = r_flat.read(buffer, 4); + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(r_flat.tell(), 4); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + r_flat.seek(1); // reset to first pos in file + BOOST_CHECK_EQUAL(r_flat.tell(), 1); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 4); + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(r_flat.tell(), 5); + 
std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference3), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + } + + { + chunked_reader r_zstd = chunked_reader(fastafs_file_zstd.c_str()); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 403); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference1), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + // test what happens when file is closed + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + // test seek stuff + r_zstd.seek(0); // reset to first pos in file + BOOST_CHECK_EQUAL(r_zstd.tell(), 0); + + written = r_zstd.read(buffer, 4); + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(r_zstd.tell(), 4); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + r_zstd.seek(1); // reset to first pos in file + BOOST_CHECK_EQUAL(r_zstd.tell(), 1); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 4); + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(r_zstd.tell(), 5); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference3), 0, "Difference in content"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + } +} + + +BOOST_AUTO_TEST_CASE(test_chunked_reading_large_file) +{ + // this file needs two buffers as its size is 1593 + + std::string test_name = "test_007"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + std::string fastafs_file_zstd = "tmp/" + 
test_name + ".fastafs.zst"; + + fasta_to_fastafs(fasta_file, fastafs_file, false); + ZSTD_seekable_compressFile_orDie((const char*) fastafs_file.c_str(), + (const char*) fastafs_file_zstd.c_str(), + (int) ZSTD_COMPRESSION_QUALIITY, + (unsigned) ZSTD_SEEKABLE_FRAME_SIZE); + + + char buffer[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + std::string std_buffer; + size_t written; + + std::string reference1 = "\x0f\x0a\x46\x53\x00\x00\x00\x00\x80\x00\x00\x00\x06\x20\x00\x00\x17\xd7\xf4\xbd\xdd\x5d\x39\xcc\xce\x7e\xe8\x6e\x9d\x92\x70\x2d\x96\x68\x9f\xba\x83\xe1\x99\x2d\x9f\xe4\xed\x65\x3f\x09\x88\x5d\x28\x5c\xc0\x99\x36\x80\x87\xdc\x02\xc0\xe5\x5a\xef\xae\x56\x95\x59\x91\xb6\xde\x35\xf4\x1c\x60\x1e\x30\xd1\x77\x1c\x70\x2d\xda\xed\xc5\xfc\x58\x8a\x28\x94\x2b\x4f\x96\x97\x18\xa0\x65\x22\x48\xa6\x06\x1b\x65\x7f\xf4\x82\x8f\xe3\x05\xde\x00\x70\xb7\xb5\xa4\x1e\xc3\x43\xe9\x49\x92\x8b\x47\xa6\xdd\x97\xd4\x93\x4d\xb4\xd0\x76\xc7\x4d\xeb\x71\x48\x77\x43\x91\xcd\xe5\x8f\x8d\xa2\xcb\x28\x53\xcf\x82\xa4\xd5\x85\x78\xae\x37\xd9\x19\x13\x54\x52\x0c\x7d\xcb\x2a\xfd\x1b\x38\x66\xaa\xd3\x23\xe6\xf7\x20\xd5\x0a\xf1\x4b\x59\xe6\x0b\xbe\x42\xa9\x5e\x7d\xce\xec\x73\xd9\x8b\xc6\x4b\x35\xe4\x69\xbc\x10\x35\x8a\x0e\x09\x2b\xf1\x9f\x38\x15\x57\x21\x08\xe1\xa6\x6e\xf1\x8c\x52\x08\x1b\x85\x50\xe0\x1e\x01\x35\x3a\x0a\x72\x1b\xb3\xda\xfd\x78\x36\x10\xb7\x1a\x2e\x93\xd4\x63\xab\x0b\x98\xfb\x4b\x97\x47\x7f\x61\x0f\x36\x7f\xfe\x02\x36\x2e\x30\xa5\xdb\x8f\xde\xd0\xc0\xc6\x9c\x3a\x7b\x71\x24\x1e\xc3\x04\xac\x31\x7a\xf5\xf3\x33\x26\x99\xa6\x4f\x43\x6c\x46\x5c\x4d\xf5\xb8\x43\x1d\xd7\x73\x3e\xe9\xb6\x3b\xdf\xff\xf6\xf3\x2a\x34\x3f\x39\x60\x4b\xed\xde\xf4\x2f\x5d\xe7\xab\xfe\xa1\x4d\x11\x9c\xcc\x41\xf8\x3c\xdd\x18\xea\xea\x45\x3e\xa5\x0b\xb5\x7b\x38\x5e\x26\x72\xdd\x24\x51\x48\xcf\x79\xa7\xd9\x06\x2e\xe8\xfb\x5d\x3d\x4a\x81\x0d\x15\x48\xd0\x84\x15\x0f\x15\x5b\xc3\x9d\x48\xc4\x9e\x2f\x45\xd4\x1c\x24\xc4\x90\x60\xe6\xa1\x19\x6f\x2d\x3a\xf9\x52\x0d\x06\x93\x21\xb4\xc2\x43\xd7\xce\
x5b\xaa\x42\x20\x35\x6c\x45\xa2\xea\xd8\xe0\xc7\x90\xe3\x4a\x3e\xb1\x65\xaf\x5c\xe3\x23\x58\x65\x88\x92\x0b\x98\xc4\x3f\x7b\xb4\x42\x6c\x77\x7f\xf3\x51\x1a\x17\x89\x1f\x03\x66\x95\xbb\x83\x3f\xfb\xd1\x8c\x46\x40\x7f\xd1\xff\x7a\xbb\xb2\xcc\xaa\xc9\xfe\xbe\x7a\xff\x5b\xf7\x17\xe0\x4d\xca\x6b\xf2\xef\x0f\x0c\x48\x90\x5c\x6d\xa4\x53\xf4\xfb\xe3\xfe\x38\x61\xdb\x32\xe6\x6e\x35\x86\xad\xad\x33\x3b\x7f\x92\x7f\xf9\x4c\xbd\x92\xbe\x41\x4f\x23\x37\xa2\x6e\xd9\x7d\x82\x47\xa4\x8f\x77\x51\xdb\x2f\xd6\xda\xcb\x1d\x7b\x2b\xe3\x29\x6f\x03\xad\xce\x05\xa7\xab\x34\x52\xb8\x94\xd3\x08\x5b\x9f\x0d\xec\x27\x09\xce\xb5\x82\x89\x43\xe0\xc3\xc3\x7f\xad\xeb\x30\x0a\x5c\xa8\x88\xc8\x38\x02\x18\x4d\xda\x80\x02\xf5\xb0\x0b\xbf\x3b\xbc\x11\x6b\xe7\xfd\x4b\x4a\xe9\x48\x31\x9f\x3a\x83\x80\x7b\x21\x73\xf8\x99\x43\x1b\xd6\x1a\xb6\xce\xe4\xff\x0e\x58\x33\x86\xd0\x09\x70\x14\x63\xc6\x45\x8f\x2a\x5f\xc8\xb2\x82\xdc\x4f\x99\x81\xa8\x87\xe4\xbf\xc5\xfe\x35\x81\x73\x63\x21\xf1\x82\xdb\x73\xfe\xe2\x1b\x5f\xff\x07\x8b\xb4\xef\xb6\x6a\x92\x9c\xcf\x6d\x09\xb1\xc1\x78\xa4\x56\x37\xe4\x6a\xf9\x01\x1e\x8c\x51\x14\x10\x34\xbd\xb0\x4f\xc6\xcb\xd6\xf4\xee\xed\x7c\x23\xa2\x80\xde\x5d\x76\x9d\x09\xd8\x1d\x45\x21\xc1\xad\xe9\x74\xf2\x61\xd4\x0b\xc7\x0d\x6a\xab\x25\x7c\x19\xa3\xf0\x88\x87\x7b\xba\xf0\x37\x3f\x59\x8f\x7f\x8e\x25\xbb\x80\x70\xf2\xe3\xf5\x0a\xa5\xb5\x2c\x43\x6f\xf1\x7b\xd3\x48\x86\x9a\xa2\xb1\x42\x89\xf3\x00\x0e\x9d\x99\xca\x5e\xb0\x2a\xf7\x46\xe6\xfb\xb9\x22\xc9\x14\xb9\x75\x95\x82\x87\x0d\x9a\x54\x80\xf6\xbc\x1f\xd9\xcb\x09\x0c\x4b\x5e\x38\xa1\x10\xaa\x32\xb1\xfa\xcc\xba\x37\x37\x01\x6d\x7f\xf1\x9d\x49\x35\x6a\x5b\xec\xec\xfb\x6a\x46\xca\x41\x03\x35\xfb\x56\xef\x5b\xe2\x44\xa0\x9e\xf8\x99\xde\x92\x17\x12\x98\x5e\x11\xe0\x73\x94\x23\xc9\x81\x61\xcc\x8a\xb4\x72\x5d\x6e\x1b\xfb\xa4\x3c\x79\x06\x12\xd3\x00\x47\xa7\x8e\x8c\x42\x9d\xa4\xfd\x34\xcd\xf0\x94\xdc\x3c\x84\xe3\xf7\xfc\x16\xd8\x0d\x4a\x9d\x05\xe1\xff\x1b\x47\xf1\xdc\xdf\xa4\x86\x09\xc1\xfe\xde\x45\xe4\x43\xfd\x0d\x05\xf4\x3f\xb5\x2e\xe7\x48\xde\
xc8\x2b\x8a\x5f\xee\x28\x66\x09\xb4\x65\x12\x77\x23\x6a\xe2\x80\xa4\xc2\xa5\x1e\xbe\xd9\x8e\xae\x56\x4d\x56\xfe\xed\xe8\x0e\x39\xab\xba\x68\xfd\x39\x2c\x22\x30\x80\x31\xfe\x34\x46\x7d\xea\x3c\x8e\x5b\x87\xef\xac\x2d\xe3\x80\x19\x5a\xd8\xba\x63\xd5\xb4\x59\xc0\x38\xff\xc5\xd8\x00\x75\x8e\x31\x7c\x1f\x90\x98\xdc\x4a\x9c\x67\x84\x12\x87\xb2\x06\xcc\x5c\x41\xc4\xa2\x22\x88\x2d\xf5\x43\xdc\x5f\xe8\x71\xa0\x0f\xbd\xa8\x33\x6f\x83\xbf\xc0\x3a\xfd\xa7\xf9\x8a\x93\x12\x94\x0a\x9e\x39\x68\x60\xc2\xfe\x0a\x2c\x13\xb6\x25\x5a\x85\x62\x1c\x5b"s;// xxd -p + std::string reference2 = "\x44\x2c\x05\x5b\xe6\x92\x56\x6b\x2f\xf6\x4f\xfb\xdc\x46\x9c\xe2\xbd\xac\xc0\x0d\x53\x44\x4d\x29\xd3\xe3\x61\x06\x77\xfb\x0c\x1b\xfa\x05\x17\x3b\x32\xc8\x6c\xd3\x0e\xa8\x18\xde\x64\xfb\x8a\xb8\x84\xf6\x3f\x17\xc4\x1f\xea\x8c\xea\xd5\x42\xc1\xb3\xdb\x68\x90\x8a\x24\x2f\x0c\xc5\x9b\xb6\xd6\x16\x5d\x3d\x38\xf1\xf6\x80\xf2\x56\x47\xf3\x95\x64\x7e\x50\x14\x02\x73\xa9\x0a\x04\x01\xcc\xf3\x1b\x3c\x9a\xfd\x98\x86\xdf\x54\xe6\x36\x50\xe9\xc0\x46\xd7\xae\x54\xd1\xe4\xaf\x98\xc3\xa6\xee\x44\xce\x8c\x16\xdf\x33\x87\x0b\xca\x12\x91\xac\xa4\xbe\x4e\xdb\xb2\x32\x21\x21\x16\xdb\x0c\x5f\xe3\x33\xbd\xa9\x8a\x88\xed\x3e\x65\x46\x4d\x8b\x16\xf0\x73\xe7\x76\x3d\x42\xb5\xe1\xba\x14\xe8\xd9\x99\x4f\x67\xc2\x20\x0d\x41\x07\x27\x61\x3a\x28\x49\x6f\x73\xdb\x44\xdb\xe2\x5e\x54\x4e\x1c\xe0\xd4\x66\x1e\xfe\x0c\x96\x52\xb3\x79\x00\x9d\x87\xed\xee\xc6\x82\x5e\xdc\x8f\xcd\xc8\xaa\x1c\x44\x76\x22\x14\x99\xef\x56\x73\x0e\x93\x14\x77\xa3\xa4\x52\xa7\xad\x55\x6c\xe2\x1a\x6a\x57\xd1\xb8\x4a\x8f\x3a\xa9\xcf\xab\x20\x25\xc8\xa8\x13\x30\x3c\x78\xbd\x3e\x9d\x73\x8f\xd9\x10\x9c\x15\xa8\x8a\x58\x70\x34\x38\xbb\xff\x26\x6d\x42\xcd\x2f\x8f\x7c\x20\x39\xa5\x37\x70\xf1\x1f\x65\x8a\xc5\xa3\x4f\x02\x57\x35\x17\x1b\x91\xa2\xa6\xd4\x67\x1c\x54\xde\xb4\xaf\x53\x99\x92\x23\xc1\x3d\xcc\x62\x9c\x21\xd9\xb5\xde\x5f\xd6\x1e\xa5\x4a\x45\x7e\x10\x74\xc4\x9e\x7f\x3b\xdd\xf6\x6c\xb6\xf2\xc9\xb6\xbe\x01\x45\x2e\x4a\x3b\xaf\x41\x05\x91\x38\x68\x35\x36\x0e\x1a\xc7
\xc9\x52\x6d\xc1\x9c\x9e\x50\x29\x7b\x3e\xe0\x39\x67\x32\xe8\xae\xaa\xac\x0c\xbb\x18\x4c\x11\x3b\x58\xc8\x80\x88\xf1\x6d\x7a\x3d\x36\xd0\x8e\xc1\xb1\xf8\xbb\xa9\xd1\xd6\x8f\x07\x6b\x12\x1a\x5b\xf1\xea\xed\x94\x1b\xe1\x1b\xe7\x0e\x75\x3d\x4e\xcf\x5b\x91\x2e\x78\x55\xd8\x8d\x1f\x1b\x09\x60\x38\xd2\xb8\xaa\x1f\xb5\x9d\x2c\xd4\x5c\x44\x78\x1f\x88\x4c\xaf\xa6\x2c\xeb\xca\x00\x51\xbe\xc9\x2e\x60\xaf\x0d\xb4\x02\xb3\x47\x0a\x3f\x4b\xbc\xc4\xa4\xff\xbb\xb3\x0e\x4f\xb3\xf0\x71\x3a\x84\x9a\x3d\x36\x33\x25\xeb\x2f\x76\x66\x5e\xc3\xd0\x66\xfc\xd4\x10\x3b\x78\x15\x61\x2d\xfc\xe6\x05\x7e\xda\x86\x43\x15\xb9\x78\xc2\x8b\x98\x42\x3e\x56\x42\x69\xba\xa2\xf3\x1e\xec\x00\x00\x00\x00\x21\x83\x67\xa8\x14\xed\xdc\x51\xeb\x96\x93\x98\x74\x4d\x13\x7c\x00\x00\x00\x00\x00\x00\x00\x01\x10\x00\x09\x6c\x65\x6e\x2d\x6c\x69\x6d\x69\x74\x00\x00\x00\x0e\x00\x98\x32\x91\x09"s;// xxd -p + std::string reference3 = "\x00\x00\x00\x00\x80\x00\x00\x00\x06\x20\x00\x00\x17\xd7\xf4\xbd\xdd\x5d\x39\xcc\xce\x7e\xe8\x6e\x9d\x92\x70\x2d\x96\x68\x9f\xba\x83\xe1\x99\x2d\x9f\xe4\xed\x65\x3f\x09\x88\x5d\x28\x5c\xc0\x99\x36\x80\x87\xdc\x02\xc0\xe5\x5a\xef\xae\x56\x95\x59\x91\xb6\xde\x35\xf4\x1c\x60\x1e\x30\xd1\x77\x1c\x70\x2d\xda\xed\xc5\xfc\x58\x8a\x28\x94\x2b\x4f\x96\x97\x18\xa0\x65\x22\x48\xa6\x06\x1b\x65\x7f\xf4\x82\x8f\xe3\x05\xde\x00\x70\xb7\xb5\xa4\x1e\xc3\x43\xe9\x49\x92\x8b\x47\xa6\xdd\x97\xd4\x93\x4d\xb4\xd0\x76\xc7\x4d\xeb\x71\x48\x77\x43\x91\xcd\xe5\x8f\x8d\xa2\xcb\x28\x53\xcf\x82\xa4\xd5\x85\x78\xae\x37\xd9\x19\x13\x54\x52\x0c\x7d\xcb\x2a\xfd\x1b\x38\x66\xaa\xd3\x23\xe6\xf7\x20\xd5\x0a\xf1\x4b\x59\xe6\x0b\xbe\x42\xa9\x5e\x7d\xce\xec\x73\xd9\x8b\xc6\x4b\x35\xe4\x69\xbc\x10\x35\x8a\x0e\x09\x2b\xf1\x9f\x38\x15\x57\x21\x08\xe1\xa6\x6e\xf1\x8c\x52\x08\x1b\x85\x50\xe0\x1e\x01\x35\x3a\x0a\x72\x1b\xb3\xda\xfd\x78\x36\x10\xb7\x1a\x2e\x93\xd4\x63\xab\x0b\x98\xfb\x4b\x97\x47\x7f\x61\x0f\x36\x7f\xfe\x02\x36\x2e\x30\xa5\xdb\x8f\xde\xd0\xc0\xc6\x9c\x3a\x7b\x71\x24\x1e\xc3\x04\xac\x31\x7a\xf5\xf3\x33\x26\x99\xa6\x4f\x4
3\x6c\x46\x5c\x4d\xf5\xb8\x43\x1d\xd7\x73\x3e\xe9\xb6\x3b\xdf\xff\xf6\xf3\x2a\x34\x3f\x39\x60\x4b\xed\xde\xf4\x2f\x5d\xe7\xab\xfe\xa1\x4d\x11\x9c\xcc\x41\xf8\x3c\xdd\x18\xea\xea\x45\x3e\xa5\x0b\xb5\x7b\x38\x5e\x26\x72\xdd\x24\x51\x48\xcf\x79\xa7\xd9\x06\x2e\xe8\xfb\x5d\x3d\x4a\x81\x0d\x15\x48\xd0\x84\x15\x0f\x15\x5b\xc3\x9d\x48\xc4\x9e\x2f\x45\xd4\x1c\x24\xc4\x90\x60\xe6\xa1\x19\x6f\x2d\x3a\xf9\x52\x0d\x06\x93\x21\xb4\xc2\x43\xd7\xce\x5b\xaa\x42\x20\x35\x6c\x45\xa2\xea\xd8\xe0\xc7\x90\xe3\x4a\x3e\xb1\x65\xaf\x5c\xe3\x23\x58\x65\x88\x92\x0b\x98\xc4\x3f\x7b\xb4\x42\x6c\x77\x7f\xf3\x51\x1a\x17\x89\x1f\x03\x66\x95\xbb\x83\x3f\xfb\xd1\x8c\x46\x40\x7f\xd1\xff\x7a\xbb\xb2\xcc\xaa\xc9\xfe\xbe\x7a\xff\x5b\xf7\x17\xe0\x4d\xca\x6b\xf2\xef\x0f\x0c\x48\x90\x5c\x6d\xa4\x53\xf4\xfb\xe3\xfe\x38\x61\xdb\x32\xe6\x6e\x35\x86\xad\xad\x33\x3b\x7f\x92\x7f\xf9\x4c\xbd\x92\xbe\x41\x4f\x23\x37\xa2\x6e\xd9\x7d\x82\x47\xa4\x8f\x77\x51\xdb\x2f\xd6\xda\xcb\x1d\x7b\x2b\xe3\x29\x6f\x03\xad\xce\x05\xa7\xab\x34\x52\xb8\x94\xd3\x08\x5b\x9f\x0d\xec\x27\x09\xce\xb5\x82\x89\x43\xe0\xc3\xc3\x7f\xad\xeb\x30\x0a\x5c\xa8\x88\xc8\x38\x02\x18\x4d\xda\x80\x02\xf5\xb0\x0b\xbf\x3b\xbc\x11\x6b\xe7\xfd\x4b\x4a\xe9\x48\x31\x9f\x3a\x83\x80\x7b\x21\x73\xf8\x99\x43\x1b\xd6\x1a\xb6\xce\xe4\xff\x0e\x58\x33\x86\xd0\x09\x70\x14\x63\xc6\x45\x8f\x2a\x5f\xc8\xb2\x82\xdc\x4f\x99\x81\xa8\x87\xe4\xbf\xc5\xfe\x35\x81\x73\x63\x21\xf1\x82\xdb\x73\xfe\xe2\x1b\x5f\xff\x07\x8b\xb4\xef\xb6\x6a\x92\x9c\xcf\x6d\x09\xb1\xc1\x78\xa4\x56\x37\xe4\x6a\xf9\x01\x1e\x8c\x51\x14\x10\x34\xbd\xb0\x4f\xc6\xcb\xd6\xf4\xee\xed\x7c\x23\xa2\x80\xde\x5d\x76\x9d\x09\xd8\x1d\x45\x21\xc1\xad\xe9\x74\xf2\x61\xd4\x0b\xc7\x0d\x6a\xab\x25\x7c\x19\xa3\xf0\x88\x87\x7b\xba\xf0\x37\x3f\x59\x8f\x7f\x8e\x25\xbb\x80\x70\xf2\xe3\xf5\x0a\xa5\xb5\x2c\x43\x6f\xf1\x7b\xd3\x48\x86\x9a\xa2\xb1\x42\x89\xf3\x00\x0e\x9d\x99\xca\x5e\xb0\x2a\xf7\x46\xe6\xfb\xb9\x22\xc9\x14\xb9\x75\x95\x82\x87\x0d\x9a\x54\x80\xf6\xbc\x1f\xd9\xcb\x09\x0c\x4b\x5e\x38\xa1\x10\xaa\x32\xb1\xfa\xcc\xb
a\x37\x37\x01\x6d\x7f\xf1\x9d\x49\x35\x6a\x5b\xec\xec\xfb\x6a\x46\xca\x41\x03\x35\xfb\x56\xef\x5b\xe2\x44\xa0\x9e\xf8\x99\xde\x92\x17\x12\x98\x5e\x11\xe0\x73\x94\x23\xc9\x81\x61\xcc\x8a\xb4\x72\x5d\x6e\x1b\xfb\xa4\x3c\x79\x06\x12\xd3\x00\x47\xa7\x8e\x8c\x42\x9d\xa4\xfd\x34\xcd\xf0\x94\xdc\x3c\x84\xe3\xf7\xfc\x16\xd8\x0d\x4a\x9d\x05\xe1\xff\x1b\x47\xf1\xdc\xdf\xa4\x86\x09\xc1\xfe\xde\x45\xe4\x43\xfd\x0d\x05\xf4\x3f\xb5\x2e\xe7\x48\xde\xc8\x2b\x8a\x5f\xee\x28\x66\x09\xb4\x65\x12\x77\x23\x6a\xe2\x80\xa4\xc2\xa5\x1e\xbe\xd9\x8e\xae\x56\x4d\x56\xfe\xed\xe8\x0e\x39\xab\xba\x68\xfd\x39\x2c\x22\x30\x80\x31\xfe\x34\x46\x7d\xea\x3c\x8e\x5b\x87\xef\xac\x2d\xe3\x80\x19\x5a\xd8\xba\x63\xd5\xb4\x59\xc0\x38\xff\xc5\xd8\x00\x75\x8e\x31\x7c\x1f\x90\x98\xdc\x4a\x9c\x67\x84\x12\x87\xb2\x06\xcc\x5c\x41\xc4\xa2\x22\x88\x2d\xf5\x43\xdc\x5f\xe8\x71\xa0\x0f\xbd\xa8\x33\x6f\x83\xbf\xc0\x3a\xfd\xa7\xf9\x8a\x93\x12\x94\x0a\x9e\x39\x68\x60\xc2\xfe\x0a\x2c\x13\xb6\x25\x5a\x85\x62\x1c\x5b\x44\x2c\x05\x5b"s;// xxd -p + std::string reference4 = 
"\xe6\x92\x56\x6b\x2f\xf6\x4f\xfb\xdc\x46\x9c\xe2\xbd\xac\xc0\x0d\x53\x44\x4d\x29\xd3\xe3\x61\x06\x77\xfb\x0c\x1b\xfa\x05\x17\x3b\x32\xc8\x6c\xd3\x0e\xa8\x18\xde\x64\xfb\x8a\xb8\x84\xf6\x3f\x17\xc4\x1f\xea\x8c\xea\xd5\x42\xc1\xb3\xdb\x68\x90\x8a\x24\x2f\x0c\xc5\x9b\xb6\xd6\x16\x5d\x3d\x38\xf1\xf6\x80\xf2\x56\x47\xf3\x95\x64\x7e\x50\x14\x02\x73\xa9\x0a\x04\x01\xcc\xf3\x1b\x3c\x9a\xfd\x98\x86\xdf\x54\xe6\x36\x50\xe9\xc0\x46\xd7\xae\x54\xd1\xe4\xaf\x98\xc3\xa6\xee\x44\xce\x8c\x16\xdf\x33\x87\x0b\xca\x12\x91\xac\xa4\xbe\x4e\xdb\xb2\x32\x21\x21\x16\xdb\x0c\x5f\xe3\x33\xbd\xa9\x8a\x88\xed\x3e\x65\x46\x4d\x8b\x16\xf0\x73\xe7\x76\x3d\x42\xb5\xe1\xba\x14\xe8\xd9\x99\x4f\x67\xc2\x20\x0d\x41\x07\x27\x61\x3a\x28\x49\x6f\x73\xdb\x44\xdb\xe2\x5e\x54\x4e\x1c\xe0\xd4\x66\x1e\xfe\x0c\x96\x52\xb3\x79\x00\x9d\x87\xed\xee\xc6\x82\x5e\xdc\x8f\xcd\xc8\xaa\x1c\x44\x76\x22\x14\x99\xef\x56\x73\x0e\x93\x14\x77\xa3\xa4\x52\xa7\xad\x55\x6c\xe2\x1a\x6a\x57\xd1\xb8\x4a\x8f\x3a\xa9\xcf\xab\x20\x25\xc8\xa8\x13\x30\x3c\x78\xbd\x3e\x9d\x73\x8f\xd9\x10\x9c\x15\xa8\x8a\x58\x70\x34\x38\xbb\xff\x26\x6d\x42\xcd\x2f\x8f\x7c\x20\x39\xa5\x37\x70\xf1\x1f\x65\x8a\xc5\xa3\x4f\x02\x57\x35\x17\x1b\x91\xa2\xa6\xd4\x67\x1c\x54\xde\xb4\xaf\x53\x99\x92\x23\xc1\x3d\xcc\x62\x9c\x21\xd9\xb5\xde\x5f\xd6\x1e\xa5\x4a\x45\x7e\x10\x74\xc4\x9e\x7f\x3b\xdd\xf6\x6c\xb6\xf2\xc9\xb6\xbe\x01\x45\x2e\x4a\x3b\xaf\x41\x05\x91\x38\x68\x35\x36\x0e\x1a\xc7\xc9\x52\x6d\xc1\x9c\x9e\x50\x29\x7b\x3e\xe0\x39\x67\x32\xe8\xae\xaa\xac\x0c\xbb\x18\x4c\x11\x3b\x58\xc8\x80\x88\xf1\x6d\x7a\x3d\x36\xd0\x8e\xc1\xb1\xf8\xbb\xa9\xd1\xd6\x8f\x07\x6b\x12\x1a\x5b\xf1\xea\xed\x94\x1b\xe1\x1b\xe7\x0e\x75\x3d\x4e\xcf\x5b\x91\x2e\x78\x55\xd8\x8d\x1f\x1b\x09\x60\x38\xd2\xb8\xaa\x1f\xb5\x9d\x2c\xd4\x5c\x44\x78\x1f\x88\x4c\xaf\xa6\x2c\xeb\xca\x00\x51\xbe\xc9\x2e\x60\xaf\x0d\xb4\x02\xb3\x47\x0a\x3f\x4b\xbc\xc4\xa4\xff\xbb\xb3\x0e\x4f\xb3\xf0\x71\x3a\x84\x9a\x3d\x36\x33\x25\xeb\x2f\x76\x66\x5e\xc3\xd0\x66\xfc\xd4\x10\x3b\x78\x15\x61\x2d\xfc\xe6\x05\x7e\xda\x86\x4
3\x15\xb9\x78\xc2\x8b\x98\x42\x3e\x56\x42\x69\xba\xa2\xf3\x1e\xec\x00\x00\x00\x00\x21\x83\x67\xa8\x14\xed\xdc\x51\xeb\x96\x93\x98\x74\x4d\x13\x7c\x00\x00\x00\x00\x00\x00\x00\x01\x10\x00\x09\x6c\x65\x6e\x2d\x6c\x69\x6d\x69\x74\x00\x00\x00\x0e\x00\x98\x32\x91\x09"s;// xxd -p + std::string reference5 = "\x00\x00\x00\x00"s;// xxd -p + std::string reference6 = "\x80\x00\x00\x00\x06\x20\x00\x00\x17\xd7\xf4\xbd\xdd\x5d\x39\xcc\xce\x7e\xe8\x6e\x9d\x92\x70\x2d\x96\x68\x9f\xba\x83\xe1\x99\x2d\x9f\xe4\xed\x65\x3f\x09\x88\x5d\x28\x5c\xc0\x99\x36\x80\x87\xdc\x02\xc0\xe5\x5a\xef\xae\x56\x95\x59\x91\xb6\xde\x35\xf4\x1c\x60\x1e\x30\xd1\x77\x1c\x70\x2d\xda\xed\xc5\xfc\x58\x8a\x28\x94\x2b\x4f\x96\x97\x18\xa0\x65\x22\x48\xa6\x06\x1b\x65\x7f\xf4\x82\x8f\xe3\x05\xde\x00\x70\xb7\xb5\xa4\x1e\xc3\x43\xe9\x49\x92\x8b\x47\xa6\xdd\x97\xd4\x93\x4d\xb4\xd0\x76\xc7\x4d\xeb\x71\x48\x77\x43\x91\xcd\xe5\x8f\x8d\xa2\xcb\x28\x53\xcf\x82\xa4\xd5\x85\x78\xae\x37\xd9\x19\x13\x54\x52\x0c\x7d\xcb\x2a\xfd\x1b\x38\x66\xaa\xd3\x23\xe6\xf7\x20\xd5\x0a\xf1\x4b\x59\xe6\x0b\xbe\x42\xa9\x5e\x7d\xce\xec\x73\xd9\x8b\xc6\x4b\x35\xe4\x69\xbc\x10\x35\x8a\x0e\x09\x2b\xf1\x9f\x38\x15\x57\x21\x08\xe1\xa6\x6e\xf1\x8c\x52\x08\x1b\x85\x50\xe0\x1e\x01\x35\x3a\x0a\x72\x1b\xb3\xda\xfd\x78\x36\x10\xb7\x1a\x2e\x93\xd4\x63\xab\x0b\x98\xfb\x4b\x97\x47\x7f\x61\x0f\x36\x7f\xfe\x02\x36\x2e\x30\xa5\xdb\x8f\xde\xd0\xc0\xc6\x9c\x3a\x7b\x71\x24\x1e\xc3\x04\xac\x31\x7a\xf5\xf3\x33\x26\x99\xa6\x4f\x43\x6c\x46\x5c\x4d\xf5\xb8\x43\x1d\xd7\x73\x3e\xe9\xb6\x3b\xdf\xff\xf6\xf3\x2a\x34\x3f\x39\x60\x4b\xed\xde\xf4\x2f\x5d\xe7\xab\xfe\xa1\x4d\x11\x9c\xcc\x41\xf8\x3c\xdd\x18\xea\xea\x45\x3e\xa5\x0b\xb5\x7b\x38\x5e\x26\x72\xdd\x24\x51\x48\xcf\x79\xa7\xd9\x06\x2e\xe8\xfb\x5d\x3d\x4a\x81\x0d\x15\x48\xd0\x84\x15\x0f\x15\x5b\xc3\x9d\x48\xc4\x9e\x2f\x45\xd4\x1c\x24\xc4\x90\x60\xe6\xa1\x19\x6f\x2d\x3a\xf9\x52\x0d\x06\x93\x21\xb4\xc2\x43\xd7\xce\x5b\xaa\x42\x20\x35\x6c\x45\xa2\xea\xd8\xe0\xc7\x90\xe3\x4a\x3e\xb1\x65\xaf\x5c\xe3\x23\x58\x65\x88\x92\x0b\x98\
xc4\x3f\x7b\xb4\x42\x6c\x77\x7f\xf3\x51\x1a\x17\x89\x1f\x03\x66\x95\xbb\x83\x3f\xfb\xd1\x8c\x46\x40\x7f\xd1\xff\x7a\xbb\xb2\xcc\xaa\xc9\xfe\xbe\x7a\xff\x5b\xf7\x17\xe0\x4d\xca\x6b\xf2\xef\x0f\x0c\x48\x90\x5c\x6d\xa4\x53\xf4\xfb\xe3\xfe\x38\x61\xdb\x32\xe6\x6e\x35\x86\xad\xad\x33\x3b\x7f\x92\x7f\xf9\x4c\xbd\x92\xbe\x41\x4f\x23\x37\xa2\x6e\xd9\x7d\x82\x47\xa4\x8f\x77\x51\xdb\x2f\xd6\xda\xcb\x1d\x7b\x2b\xe3\x29\x6f\x03\xad\xce\x05\xa7\xab\x34\x52\xb8\x94\xd3\x08\x5b\x9f\x0d\xec\x27\x09\xce\xb5\x82\x89\x43\xe0\xc3\xc3\x7f\xad\xeb\x30\x0a\x5c\xa8\x88\xc8\x38\x02\x18\x4d\xda\x80\x02\xf5\xb0\x0b\xbf\x3b\xbc\x11\x6b\xe7\xfd\x4b\x4a\xe9\x48\x31\x9f\x3a\x83\x80\x7b\x21\x73\xf8\x99\x43\x1b\xd6\x1a\xb6\xce\xe4\xff\x0e\x58\x33\x86\xd0\x09\x70\x14\x63\xc6\x45\x8f\x2a\x5f\xc8\xb2\x82\xdc\x4f\x99\x81\xa8\x87\xe4\xbf\xc5\xfe\x35\x81\x73\x63\x21\xf1\x82\xdb\x73\xfe\xe2\x1b\x5f\xff\x07\x8b\xb4\xef\xb6\x6a\x92\x9c\xcf\x6d\x09\xb1\xc1\x78\xa4\x56\x37\xe4\x6a\xf9\x01\x1e\x8c\x51\x14\x10\x34\xbd\xb0\x4f\xc6\xcb\xd6\xf4\xee\xed\x7c\x23\xa2\x80\xde\x5d\x76\x9d\x09\xd8\x1d\x45\x21\xc1\xad\xe9\x74\xf2\x61\xd4\x0b\xc7\x0d\x6a\xab\x25\x7c\x19\xa3\xf0\x88\x87\x7b\xba\xf0\x37\x3f\x59\x8f\x7f\x8e\x25\xbb\x80\x70\xf2\xe3\xf5\x0a\xa5\xb5\x2c\x43\x6f\xf1\x7b\xd3\x48\x86\x9a\xa2\xb1\x42\x89\xf3\x00\x0e\x9d\x99\xca\x5e\xb0\x2a\xf7\x46\xe6\xfb\xb9\x22\xc9\x14\xb9\x75\x95\x82\x87\x0d\x9a\x54\x80\xf6\xbc\x1f\xd9\xcb\x09\x0c\x4b\x5e\x38\xa1\x10\xaa\x32\xb1\xfa\xcc\xba\x37\x37\x01\x6d\x7f\xf1\x9d\x49\x35\x6a\x5b\xec\xec\xfb\x6a\x46\xca\x41\x03\x35\xfb\x56\xef\x5b\xe2\x44\xa0\x9e\xf8\x99\xde\x92\x17\x12\x98\x5e\x11\xe0\x73\x94\x23\xc9\x81\x61\xcc\x8a\xb4\x72\x5d\x6e\x1b\xfb\xa4\x3c\x79\x06\x12\xd3\x00\x47\xa7\x8e\x8c\x42\x9d\xa4\xfd\x34\xcd\xf0\x94\xdc\x3c\x84\xe3\xf7\xfc\x16\xd8\x0d\x4a\x9d\x05\xe1\xff\x1b\x47\xf1\xdc\xdf\xa4\x86\x09\xc1\xfe\xde\x45\xe4\x43\xfd\x0d\x05\xf4\x3f\xb5\x2e\xe7\x48\xde\xc8\x2b\x8a\x5f\xee\x28\x66\x09\xb4\x65\x12\x77\x23\x6a\xe2\x80\xa4\xc2\xa5\x1e\xbe\xd9\x8e\xae\x56\x4d\x56\xfe\
xed\xe8\x0e\x39\xab\xba\x68\xfd\x39\x2c\x22\x30\x80\x31\xfe\x34\x46\x7d\xea\x3c\x8e\x5b\x87\xef\xac\x2d\xe3\x80\x19\x5a\xd8\xba\x63\xd5\xb4\x59\xc0\x38\xff\xc5\xd8\x00\x75\x8e\x31\x7c\x1f\x90\x98\xdc\x4a\x9c\x67\x84\x12\x87\xb2\x06\xcc\x5c\x41\xc4\xa2\x22\x88\x2d\xf5\x43\xdc\x5f\xe8\x71\xa0\x0f\xbd\xa8\x33\x6f\x83\xbf\xc0\x3a\xfd\xa7\xf9\x8a\x93\x12\x94\x0a\x9e\x39\x68\x60\xc2\xfe\x0a\x2c\x13\xb6\x25\x5a\x85\x62\x1c\x5b\x44\x2c\x05\x5b\xe6\x92\x56\x6b"s;// xxd -p + + + { + chunked_reader r_flat = chunked_reader(fastafs_file.c_str()); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference1), 0, "Difference in content 1st read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 569); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + // set back + r_flat.seek(1024); + + written = r_flat.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 569); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + // set back + r_flat.seek(4); + + written = r_flat.read(buffer, 1024);// reads across two buffers? 
+ BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference3), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 565); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference4), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + r_flat.seek(4); + + written = r_flat.read(buffer, 4);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 4); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference5), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_flat.read(buffer, 1024);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference6), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + } + + { + chunked_reader r_zstd = chunked_reader(fastafs_file_zstd.c_str()); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference1), 0, "Difference in content 1st read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 569); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 0); + flush_buffer(buffer, 
READ_BUFFER_SIZE + 1, '\0'); + + + // set back + r_zstd.seek(1024); + + written = r_zstd.read(buffer, 1024); + BOOST_CHECK_EQUAL(written, 569); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference2), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + // set back + r_zstd.seek(4); + + written = r_zstd.read(buffer, 1024);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference3), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 565); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference4), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + + r_zstd.seek(4); + + written = r_zstd.read(buffer, 4);// reads across two buffers? + BOOST_CHECK_EQUAL(written, 4); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference5), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + + written = r_zstd.read(buffer, 1024);// reads across two buffers? 
+ BOOST_CHECK_EQUAL(written, 1024); + std_buffer = std::string(buffer, written); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(reference6), 0, "Difference in content 2nd read"); + flush_buffer(buffer, READ_BUFFER_SIZE + 1, '\0'); + } +} + + + +BOOST_AUTO_TEST_SUITE_END() + diff --git a/test/data/test_010.fa b/test/data/test_010.fa new file mode 100644 index 00000000..f7074dc4 --- /dev/null +++ b/test/data/test_010.fa @@ -0,0 +1,7 @@ +>PROTEIN +AEB***FA +STAFS*** +CZCIIXEL +BNQPCMTF +XNTXGWQC +JJ-YG diff --git a/test/data/test_011.fa b/test/data/test_011.fa new file mode 100644 index 00000000..e11da65d --- /dev/null +++ b/test/data/test_011.fa @@ -0,0 +1,2 @@ +>twobit-fourbit-fivebit-error +MAFSAEDVLKEYDRRRRMEAL diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 6ba16cd0..d8bd2de3 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -44,28 +44,27 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 // > c h r 1 \n t t t t c c c c a a a a g g g g \n - BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(100), 23); + BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(40), 23); BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(16), 23); // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 // > c h r 1 \n t t t t c c c c a a a a g g g \n g \n BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(15), 24); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + chunked_reader file = chunked_reader(fs.filename.c_str()); - ffs2f_init* cache_p100 = fs.init_ffs2f(100, true); + ffs2f_init* cache_p40 = fs.init_ffs2f(40, true); ffs2f_init* cache_p23 = fs.init_ffs2f(23, true); // then: check returncodes: uint32_t ret; char chunk[4]; for(uint32_t i = 0; i < 23; i++) { - ret = 
fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p40->sequences[0], chunk, 1, i, file); BOOST_CHECK_EQUAL(ret, 1); } for(uint32_t i = 23; i < 23 + 5; i++) { - ret = fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p40->sequences[0], chunk, 1, i, file); BOOST_CHECK_EQUAL(ret, 0); } @@ -75,7 +74,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk(cache_p23->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p23->sequences[0], chunk, 1, i, file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } @@ -84,9 +83,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) BOOST_CHECK_EQUAL(chunk[2], '\1'); BOOST_CHECK_EQUAL(chunk[3], '\2'); - file.close(); - - delete cache_p100; + delete cache_p40; delete cache_p23; } @@ -104,8 +101,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 // > c h r 1 \n T T T T C C C C A A A A G G G G \n BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(fs.data[0]->n), 23); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + chunked_reader file = chunked_reader(fs.filename.c_str()); ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // then: check returncodes: @@ -115,17 +111,15 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } 
// check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), file); BOOST_CHECK_EQUAL(ret, 0); - file.close(); - delete cache_p0; } @@ -145,8 +139,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 // > c h r 1 \n T T T T C C C C A A A A G G G G \n BOOST_CHECK_EQUAL(fs.data[0]->fasta_filesize(fs.data[0]->n), 23); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + chunked_reader file = chunked_reader(fs.filename.c_str()); ffs2f_init* cache_p0 = fs.init_ffs2f(0, false); // no masking; everything must be uppercase @@ -156,17 +149,15 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) std::string ref = ">chr1\nTTTTCCCCAAAAGGGG\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } // check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), file); BOOST_CHECK_EQUAL(ret, 0); - file.close(); - delete cache_p0; } @@ -182,11 +173,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1) ffs2f_init* cache_p0 = fs.init_ffs2f(0, false); // allow masking = false, alles moet in capital / upper case BOOST_REQUIRE(fs.data.size() > 0); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + chunked_reader file = chunked_reader(fs.filename.c_str()); //fs.data[0]->sha1(cache_p0->sequences[0], 
&file); - BOOST_CHECK_EQUAL(fs.data[0]->sha1(cache_p0->sequences[0], &file), "2c0cae1d4e272b3ba63e7dd7e3c0efe62f2aaa2f"); + BOOST_CHECK_EQUAL(fs.data[0]->sha1(cache_p0->sequences[0], file), "2c0cae1d4e272b3ba63e7dd7e3c0efe62f2aaa2f"); delete cache_p0; } @@ -203,16 +193,15 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_md5) ffs2f_init* cache = fs.init_ffs2f(0, false); // allow masking = false, alles moet in capital / upper case BOOST_REQUIRE(fs.data.size() > 0); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + chunked_reader file = chunked_reader(fs.filename.c_str()); - BOOST_CHECK_EQUAL(fs.data[0]->md5(cache->sequences[0], &file), "75255c6d90778999ad3643a2e69d4344"); - BOOST_CHECK_EQUAL(fs.data[1]->md5(cache->sequences[1], &file), "8b5673724a9965c29a1d76fe7031ac8a"); - BOOST_CHECK_EQUAL(fs.data[2]->md5(cache->sequences[2], &file), "61deba32ec4c3576e3998fa2d4b87288"); - BOOST_CHECK_EQUAL(fs.data[3]->md5(cache->sequences[3], &file), "99b90560f23c1bda2871a6c93fd6a240"); - BOOST_CHECK_EQUAL(fs.data[4]->md5(cache->sequences[4], &file), "3625afdfbeb43765b85f612e0acb4739"); - BOOST_CHECK_EQUAL(fs.data[5]->md5(cache->sequences[5], &file), "bd8c080ed25ba8a454d9434cb8d14a68"); - BOOST_CHECK_EQUAL(fs.data[6]->md5(cache->sequences[6], &file), "980ef3a1cd80afec959dcf852d026246"); + BOOST_CHECK_EQUAL(fs.data[0]->md5(cache->sequences[0], file), "75255c6d90778999ad3643a2e69d4344"); + BOOST_CHECK_EQUAL(fs.data[1]->md5(cache->sequences[1], file), "8b5673724a9965c29a1d76fe7031ac8a"); + BOOST_CHECK_EQUAL(fs.data[2]->md5(cache->sequences[2], file), "61deba32ec4c3576e3998fa2d4b87288"); + BOOST_CHECK_EQUAL(fs.data[3]->md5(cache->sequences[3], file), "99b90560f23c1bda2871a6c93fd6a240"); + BOOST_CHECK_EQUAL(fs.data[4]->md5(cache->sequences[4], file), "3625afdfbeb43765b85f612e0acb4739"); + BOOST_CHECK_EQUAL(fs.data[5]->md5(cache->sequences[5], file), "bd8c080ed25ba8a454d9434cb8d14a68"); + 
BOOST_CHECK_EQUAL(fs.data[6]->md5(cache->sequences[6], file), "980ef3a1cd80afec959dcf852d026246"); delete cache; } @@ -232,8 +221,8 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) BOOST_REQUIRE(fs.data.size() > 0); - std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(file.is_open()); + //std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + //BOOST_REQUIRE(file.is_open()); BOOST_CHECK_EQUAL(fs.check_sequence_integrity(false), true); } @@ -887,4 +876,71 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) + + + +/** + * @description tests size and content of creating dict files + */ +BOOST_AUTO_TEST_CASE(test_fastafs__failing_example) +{ + // s=4096, off=20480 + + // is auto-generated by python script + fastafs fs = fastafs("test"); + fs.load("tmp/benchmark/test.zst"); + + BOOST_REQUIRE(fs.data.size() > 0); + + ffs2f_init* cache_p40 = fs.init_ffs2f(40, true); // equals original fasta + + + const int READ_BUFFER_SIZE_F = 4096 ; // make sure it is large enough, error occurrsed with buf len=4096 + char* buffer = new char[READ_BUFFER_SIZE_F + 2]; + uint32_t ret; + + // test the first read + chunked_reader fh1 = chunked_reader(fs.filename.c_str()); + flush_buffer(buffer, READ_BUFFER_SIZE_F + 1, '\0'); + ret = fs.view_fasta_chunk(cache_p40, buffer, 4096, 0, fh1); + printf("[%i]\n", ret); + buffer[4096] = '\0'; + //printf("[%s]\n", buffer); + printf("----------------------------------------------------------------\n", buffer); + + // test the first read + flush_buffer(buffer, READ_BUFFER_SIZE_F + 1, '\0'); + ret = fs.view_fasta_chunk(cache_p40, buffer, 4096, 0); + printf("[%i]\n", ret); + buffer[4096] = '\0'; + //printf("[%s]\n", buffer); + printf("----------------------------------------------------------------\n", buffer); + + + + + // test the first read + //chunked_reader fh2 = chunked_reader(fs.filename.c_str()); + flush_buffer(buffer, READ_BUFFER_SIZE_F + 1, '\0'); 
+ ret = fs.view_fasta_chunk(cache_p40, buffer, 4096, 20480, fh1); + printf("[%i]\n", ret); + buffer[4096] = '\0'; + printf("[%s]\n", buffer); + printf("----------------------------------------------------------------\n", buffer); + + // test the first read + flush_buffer(buffer, READ_BUFFER_SIZE_F + 1, '\0'); + ret = fs.view_fasta_chunk(cache_p40, buffer, 4096, 20480); + printf("[%i]\n", ret); + buffer[4096] = '\0'; + //printf("[%s]\n", buffer); + printf("----------------------------------------------------------------\n", buffer); + + + + delete cache_p40; + delete[] buffer; +} + + BOOST_AUTO_TEST_SUITE_END() diff --git a/test/fivebit_fivebytes/test_fivebit_fivebytes.cpp b/test/fivebit_fivebytes/test_fivebit_fivebytes.cpp new file mode 100644 index 00000000..bc56bd2d --- /dev/null +++ b/test/fivebit_fivebytes/test_fivebit_fivebytes.cpp @@ -0,0 +1,202 @@ +#define BOOST_TEST_MODULE fivebit_fivebytes + +#include + +#include "config.hpp" + +#include "fivebit_fivebytes.hpp" + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_fivebit_fivebytes_conversions) +{ + unsigned char seq_comp[5]; + + // F A S T A - F S + // [05 ][00 ][18 ][19 ][00 ][27 ][05 ][18 ] + // 00101 00000 10010 10011 00000 11011 00101 10010 + // 00101000 00100101 00110000 01101100 10110010 + // FFFFFAAA AASSSSST TTTTAAAA A-----FF FFFSSSSS + // 40 37 48 108 178 + + seq_comp[0] = 40; + seq_comp[1] = 37; + seq_comp[2] = 48; + seq_comp[3] = 108; + seq_comp[4] = 178; + + fivebit_fivebytes f = fivebit_fivebytes();// set_compressed(char *);// string with 5 character - requires unpacking + f.set_compressed(seq_comp); + + char *seq_decomp = f.get(); + + + BOOST_CHECK_EQUAL(seq_decomp[0], 'F'); + BOOST_CHECK_EQUAL(seq_decomp[1], 'A'); + BOOST_CHECK_EQUAL(seq_decomp[2], 'S'); + BOOST_CHECK_EQUAL(seq_decomp[3], 'T'); + BOOST_CHECK_EQUAL(seq_decomp[4], 'A'); + BOOST_CHECK_EQUAL(seq_decomp[5], '-'); + BOOST_CHECK_EQUAL(seq_decomp[6], 'F'); + BOOST_CHECK_EQUAL(seq_decomp[7], 'S'); + + + 
fivebit_fivebytes f2 = fivebit_fivebytes();// set_compressed(char *);// string with 5 character - requires unpacking + + f2.data_compressed[0] = 1; + f2.data_compressed[1] = 3; + f2.data_compressed[2] = 3; + f2.data_compressed[3] = 7; + f2.data_compressed[4] = 255; + + f2.set(0, 05);// F + f2.set(1, 00);// A + f2.set(2, 18);// S + f2.set(3, 19);// T + f2.set(4, 00);// A + f2.set(5, 27);// - + f2.set(6, 05);// F + f2.set(7, 18);// S + + // F A S T A - F S + // [05 ][00 ][18 ][19 ][00 ][27 ][05 ][18 ] + // 00101 00000 10010 10011 00000 11011 00101 10010 + // 00101000 00100101 00110000 01101100 10110010 + // FFFFFAAA AASSSSST TTTTAAAA A-----FF FFFSSSSS + // 40 37 48 108 178 + + + BOOST_CHECK_EQUAL(f2.data_compressed[0], 40); + BOOST_CHECK_EQUAL(f2.data_compressed[1], 37); + BOOST_CHECK_EQUAL(f2.data_compressed[2], 48); + BOOST_CHECK_EQUAL(f2.data_compressed[3], 108); + BOOST_CHECK_EQUAL(f2.data_compressed[4], 178); + + // reset to make sure it needs to be unpacked properly + f2.data_decompressed[0] = '?'; + f2.data_decompressed[1] = '?'; + f2.data_decompressed[2] = '?'; + f2.data_decompressed[3] = '?'; + f2.data_decompressed[4] = '?'; + f2.data_decompressed[5] = '?'; + f2.data_decompressed[6] = '?'; + f2.data_decompressed[7] = '?'; + + + f2.unpack(); + + BOOST_CHECK_EQUAL(f2.data_decompressed[0], 'F'); + BOOST_CHECK_EQUAL(f2.data_decompressed[1], 'A'); + BOOST_CHECK_EQUAL(f2.data_decompressed[2], 'S'); + BOOST_CHECK_EQUAL(f2.data_decompressed[3], 'T'); + BOOST_CHECK_EQUAL(f2.data_decompressed[4], 'A'); + BOOST_CHECK_EQUAL(f2.data_decompressed[5], '-'); + BOOST_CHECK_EQUAL(f2.data_decompressed[6], 'F'); + BOOST_CHECK_EQUAL(f2.data_decompressed[7], 'S'); +} + + + +BOOST_AUTO_TEST_CASE(test_dict_conv) +{ + char hash[255]; + hash['A'] = 0; + hash['B'] = 1; + hash['C'] = 2; + hash['D'] = 3; + hash['E'] = 4; + hash['F'] = 5; + hash['G'] = 6; + hash['H'] = 7; + hash['I'] = 8; + hash['J'] = 9; + hash['K'] = 10; + hash['L'] = 11; + hash['M'] = 12; + hash['N'] = 13; + hash['O'] 
= 14; + hash['P'] = 15; + hash['Q'] = 16; + hash['R'] = 17; + hash['S'] = 18; + hash['T'] = 19; + hash['U'] = 20; + hash['V'] = 21; + hash['W'] = 22; + hash['Y'] = 23; + hash['Z'] = 24; + hash['X'] = 25; + hash['*'] = 26; + hash['-'] = 27; + + + fivebit_fivebytes f = fivebit_fivebytes();// set_compressed(char *);// string with 5 character - requires unpacking + + std::vector dict = {"FASTA-FS", "FRATSAST", "UCTFXJNH", "CGLWQNSI", "*OLS*DEN", "LAGD*PYE", "HFFXWSVZ", "ABBECXVW", "YMGTTOX*", "I-XVFAQX", "KEYVHNER", "IUISR-ZH", "JHCVXJMK", "NNFKGPOW", "WVGTUHYB", "SYGCM-UQ", "-KLOGKUC", "W*SWGLIJ", "*ZKJTHTV", "UWRTTHUM", "XLJTHJEQ", "LRFPJHAR", "ZJAVVMHP", "MDOADYFU", "NCK*CYNZ", "YMQZXEUR", "FBHAEAZS", "ERBIE-NQ", "GIJRSMBZ", "SEAZ*PO-", "PKB*XTLD", "GDFWBYMA", "RDGF-TN-", "P-LUKTWO", "-PXBTMBL", "OOJVYSJ*", "NDMCNXNQ", "OVNK-DVU", "FPIHDXEK", "L*AKZMMG", "AYQIQQXF", "CEWTCTDR", "YQPUSSPI", "ZIRY*WWR", "MJWQAOWO", "OCYFLK-V", "GQOEOZFZ", "*SFXOJX*", "PK-BHBLJ", "QPS*KSVV", "IHDZCVM-", "-GWREDJB", "PEKCDABZ", "ULBUMPNG", "ICYANTRP", "CYQGQHR*", "ZDISLFU-", "DIUBILAR", "VSWWTJAP", "PXYJYBIH", "NA-LNTOS", "FQIUFQRX", "KACQOLYW", "TWXUYV*F", "YMANTHBD", "KLZGRNCO", "BSKQ*GDM", "PFELJUIZ", "Q-WOCMLE", "AXJNEKCQ", "WOLPA*FZ", "DMHVYSFS", "RIFXSLYC", "-CQFIYZE", "R*EZXK-V", "WVPJ*HGJ", "HUDGTUV*", "PUC-GHYT", "M-*ZOJVB", "VD*LXUAU", "OLVEK-ZX", "WIVVTQRE", "-MAPUSWY", "IJHTQSGL", "OHIKEZEE", "G-ICTNTZ", "GMRS-DP-", "ARZEWZFS", "RGTFGQFR", "JKD-ELBY", "A-XDUGGM", "MFLWDXAN", "DGOFAMAI", "O**O-STQ", "-VNKGBJC", "RPMZHJJY", "XDBBUKQK", "CUFICTTW", "AUGSBPPE", "*PYTZKEF", "AHYSKALD", "KQMB*BNN", "*HWNMGYV", "IRWY*WBU", "LJNNZNLO", "PHVFRNWR", "CCCIHRHD", "JBORIXIX", "JT*SZHKP", "ZXURDOQD", "CPSBC-VD", "IHMEOQLB", "KXNMOVEX", "SJPQUBM*", "ZCLXJSIA", "YBHNUNHH", "AYTGKWSO", "IHXJ*TD-", "MVAJKHUH", "QUXLEYWW", "TSROZJIQ", "LIAPGNHF", "RCMXSNLM", "GF*-VRJK", "IKDCTKFY", "VAPADEGP", "GKHLIWHU", "-OH-F-EZ", "*GPJZCNY", "RVUGFGVA", "JLLYKRWZ", "WSOWVDKV", "QPJZLBPH", "BYTSODIJ", 
"-MCJFBFU", "D*NIFL*L", "IG*CW*ZM", "GZTGQWHC", "CTCTBRDU", "IBNLOPPM", "DTP-HLXN", "YQRZ*GWS", "YFVBQEMM", "MYARXKOF", "SQDKDEQZ", "RKKVOEOL", "XA*YVU-W", "CQZQXTRV", "VP*YCJQM", "JWEF*O-J", "LFUTAQSF", "-EEILVRG", "DKNSYQZ-", "EQLM*BT-", "TGNJDBDS", "AJ-IGQLP", "PPDRKUKA", "WDDKVXXX", "HCHASUOB", "-RCUHJOJ", "BVANOTRB", "TYGLLX*H", "SYBOLOOM", "IIBETFMT", "PDFGC-CE", "*PTPEDLR", "ZZQQESJB", "HMNGEPEK", "UPFIPVQN", "XBWOFKFY", "XDOPNZUI", "JEMVIXKL", "EXQWSNKQ", "FAAIORTR", "BWXEMW-D", "ZVT-NUKK", "YRCACBAE", "ASDZZFXM", "XHHSBSRN", "UYSVOSIT", "DVNEHROM", "*PUCVJIC", "NYUWPZ-I", "Y*-GH-AV", "CBB-FGNQ", "CUCEAWMU", "FZO*WW-S", "G*WJ*SNY", "X*BRUXZM", "AGUMJR*Q", "MYTN*XIZ", + "CZCIIXEL", "BNQPCMTF"// failed in a test? + }; + + for(size_t i = 0; i < dict.size(); i++) { + // set and compress amino acid string + for(size_t j = 0 ; j < 8; j ++) { + f.set((unsigned char) j, hash[ dict[i][j] ]) ; + } + + // decompress + f.unpack(); + + // verify + for(size_t j = 0 ; j < 8; j ++) { + BOOST_CHECK_EQUAL_MESSAGE(dict[i][j], f.data_decompressed[j], dict[i] + " => " + std::string(1, f.data_decompressed[0]) + std::string(1, f.data_decompressed[1]) + std::string(1, f.data_decompressed[2]) + std::string(1, f.data_decompressed[3]) + std::string(1, f.data_decompressed[4]) + std::string(1, f.data_decompressed[5]) + std::string(1, f.data_decompressed[6]) + std::string(1, f.data_decompressed[7]) + " (" + std::to_string(j) + ": " + std::string(1, dict[i][j]) + " => " + std::string(1, f.data_decompressed[j]) + ")"); + } + + /* printing for debugging purpose + printf(" => [%u %u %u %u %u] => [%02hhX %02hhX %02hhX %02hhX %02hhX] => %c%c%c%c%c%c%c%c\n", + + f.data_compressed[0],f.data_compressed[1],f.data_compressed[2],f.data_compressed[3],f.data_compressed[4], + f.data_compressed[0],f.data_compressed[1],f.data_compressed[2],f.data_compressed[3],f.data_compressed[4], + + 
f.data_decompressed[0],f.data_decompressed[1],f.data_decompressed[2],f.data_decompressed[3],f.data_decompressed[4],f.data_decompressed[5],f.data_decompressed[6],f.data_decompressed[7] + ); + */ + } +} + + + + +BOOST_AUTO_TEST_CASE(test_bytes_rounding) +{ + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(0), 0); // 8 % 8 = 0 + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(1), 1); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(2), 2); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(3), 2); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(4), 3); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(5), 4); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(6), 4); + BOOST_CHECK_EQUAL(fivebit_fivebytes::decompressed_to_compressed_bytes(7), 5); + + // 12345678 12345678 12345 + // 11111 22222 3333 + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(8), 5); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(16), 10); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24), 15); + + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(21), 14); + + + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 1), 15 + 1); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 2), 15 + 2); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 3), 15 + 2); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 4), 15 + 3); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 5), 15 + 4); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 6), 15 + 4); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 7), 15 + 5); + BOOST_CHECK_EQUAL(fivebit_fivebytes::nucleotides_to_compressed_offset(24 + 
8), 15 + 5); // full reset, back to /8 +} + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/test_functional.py b/test/test_functional.py index 63285b9e..7ea4984e 100755 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -27,69 +27,71 @@ class FunctionalTest(unittest.TestCase): def test_01(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test.fa', "test_func_01", 100, './bin/fastafs', 'tmp/mnt') - + difference = diff_fasta_with_mounted(TEST_DIR + 'test.fa', T_TEST_DIR + "test_functional__test.fastafs" , "test_func_01", 100, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) - self.assertEqual(difference['stdout']['cache'] , '') - - for prog in difference['stderr']: - self.assertEqual(difference['stderr']['cache'] , '') - difference = diff_fasta_with_view(TEST_DIR + 'test.fa', "test_func_01", 100, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) + for prog in ['cache', 'check']: + self.assertEqual(difference['stderr'][prog] , '') def test_02(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_002.fa', "test_func_02", 60, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_002.fa', T_TEST_DIR + "test_functional__test_002.fastafs", "test_func_02", 60, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_002.fa', "test_func_02", 60, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) + for prog in ['cache', 'check']: + self.assertEqual(difference['stderr'][prog] , '') def test_03(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_003.fa', "test_func_03", 60, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_003.fa', T_TEST_DIR + "test_functional__test_003.fastafs", "test_func_03", 60, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_003.fa', 
"test_func_03", 60, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) + for prog in ['cache', 'check']: + self.assertEqual(difference['stderr'][prog] , '') def test_04(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_004.fa', "test_func_04", 32, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_004.fa', T_TEST_DIR + "test_functional__test_004.fastafs", "test_func_04", 32, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_004.fa', "test_func_04", 32, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) + for prog in ['cache', 'check']: + self.assertEqual(difference['stderr'][prog] , '') def test_05(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_005.fa', "test_func_05", 80, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_005.fa', T_TEST_DIR + "test_functional__test_005.fastafs", "test_func_05", 80, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_005.fa', "test_func_05", 80, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) def test_06(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_006.fa', "test_func_06", 10, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_006.fa', T_TEST_DIR + "test_functional__test_006.fastafs", "test_func_06", 10, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_006.fa', "test_func_06", 10, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) def test_07(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_007.fa', "test_func_07", 72, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_007.fa', T_TEST_DIR + "test_functional__test_007.fastafs", 
"test_func_07", 72, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_007.fa', "test_func_07", 72, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) def test_08(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_008.fa', "test_func_08", 72, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_008.fa', T_TEST_DIR + "test_functional__test_008.fastafs", "test_func_08", 72, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_008.fa', "test_func_08", 72, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) def test_09(self): - difference = diff_fasta_with_mounted(TEST_DIR + 'test_009.fa', "test_func_09", 24, './bin/fastafs', 'tmp/mnt') + difference = diff_fasta_with_mounted(TEST_DIR + 'test_009.fa', T_TEST_DIR + "test_functional__test_009.fastafs", "test_func_09", 24, './bin/fastafs', 'tmp/mnt') self.assertEqual(difference['diff'] , False) difference = diff_fasta_with_view(TEST_DIR + 'test_009.fa', "test_func_09", 24, './bin/fastafs', 'tmp/mnt') diff --git a/test/test_utils.py b/test/test_utils.py index 772482c0..2a25d0a8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,10 +1,37 @@ #!/usr/bn/env python -import subprocess -import time -import multiprocessing from multiprocessing import Process +from tqdm import tqdm +import datetime +import multiprocessing import os +import re +import subprocess +import time +import wget + + + +def get_git_revision(): + git_branch = subprocess.check_output(['git', 'log', '-n', '1']).decode("utf-8").strip().split("\n")[0].replace("commit ","git-commit:") + git_status = "|".join([_.strip().replace(" ","_") for _ in subprocess.check_output(['git', 'status', '-s', '-b']).decode("utf-8").strip().split("\n")]) + + return (git_branch, git_status) + +def get_machine_id(): + with open('/etc/machine-id') 
as fh: + return "system:" + fh.read().strip() + +def get_curtime(): + return str(datetime.datetime.now().strftime('%Y-%m-%d %X')) + + +#git = get_git_revision() +#print(get_curtime() + "\t" + get_machine_id() + "\t" + git[0] + "\t" + git[1] ) + + + + """ @@ -39,24 +66,37 @@ def get_ins_per_reading_mounted_suffix(size): def run_mount_bg(fastafs_binary, args, return_dict): - cmd = [fastafs_binary, 'mount', '-d', '-f'] + args - #return_dict[cmd] = cmd - return_dict['stdout'] = '' - return_dict['stderr'] = '' + #cmd = ['perf', 'stat' , fastafs_binary, 'mount', '-d', '-f'] + args #+ ['>' + '/dev/null'] + cmd = ['perf', 'stat', '-e', 'cycles', fastafs_binary, 'mount', '-d', '-f'] + args #+ ['>' + '/dev/null'] + + return_dict['cmd'] = cmd + + o = '' + e = '' + with subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout = subprocess.PIPE) as p: - stdout, stderr = p.communicate() + while p.poll() is None: + stdout, stderr = p.communicate() + + stdout = stdout.decode("utf-8") + stderr = stderr.decode("utf-8") - stdout = stdout.decode("utf-8") - stderr = stderr.decode("utf-8") + # i believe return_dict values can be set only once, so first cache and store later + o += "\n---\n" + str(stdout)#[0:50] + "..." + str(stdout)[-50:] + e += "\n---\n" + str(stderr)#[0:50] + "..." 
+ str(stderr)[-50:] + p.terminate() - return_dict['stdout'] = stdout - return_dict['stderr'] = stderr + return_dict['stdout'] = o[0:100] + return_dict['stderr'] = e[0:100] - #pass + return_dict['perf'] = {'cycles': int(re.search('([0-9,]+)[ \t]+cycles' , e, re.IGNORECASE).group(1).replace(",","")), + 'total_time': float(re.search('([0-9\.]+)[ \t]+seconds time elapsed' , e, re.IGNORECASE).group(1)), + 'user_time': float(re.search('([0-9\.]+)[ \t]+seconds user', e, re.IGNORECASE).group(1)), + 'sys_time': float(re.search('([0-9\.]+)[ \t]+seconds sys', e, re.IGNORECASE).group(1)) } -def diff_fasta_with_mounted(fasta_file, fastafs_tmp_name, padding, fastafs_binary, mountpoint): +def diff_fasta_with_mounted(fasta_file, fastafs_tmp_filename, fastafs_mount_alias, padding, fastafs_binary, mountpoint = "tmp/mnt"): """ Do a diff with an original fasta and one fastafs converted and mounted """ @@ -65,51 +105,113 @@ def diff_fasta_with_mounted(fasta_file, fastafs_tmp_name, padding, fastafs_binar 'stdout': {}, 'stderr': {}, 'retcode': {}, - 'diff': False + 'diff': False, + 'perf': None } - + + mountpoint = mountpoint.rstrip('/') + '/' + # 1. fasta to FASTAFS: prog = 'cache' - #cmd = [fastafs_binary, prog, '-o', fastafs_tmp_file , fasta_file] - cmd = [fastafs_binary, prog, fastafs_tmp_name , fasta_file] + cmd = [fastafs_binary, prog, '-o', fastafs_tmp_filename, fasta_file] + + p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout = subprocess.PIPE)#:# as p: stdout, stderr = p.communicate() output['cmd'][prog] = cmd output['stdout'][prog] = stdout.decode("utf-8") output['stderr'][prog] = stderr.decode("utf-8") output['retcode'][prog] = p.returncode + + + fastafs_tmp_filename = fastafs_tmp_filename + ".zst" # 2. check integrity: + """ # uncomment when supported with chunked zstd(!!!) 
prog = 'check' - cmd = [fastafs_binary, 'check', fastafs_tmp_name, fastafs_tmp_name] + cmd = [fastafs_binary, 'check', '-f', fastafs_tmp_filename] + print(cmd) p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout = subprocess.PIPE)#:# as p: stdout, stderr = p.communicate() output['cmd'][prog] = cmd output['stdout'][prog] = stdout.decode("utf-8") output['stderr'][prog] = stderr.decode("utf-8") output['retcode'][prog] = p.returncode + """ # 3. run active mount process and push it to the background manager = multiprocessing.Manager() return_dict = manager.dict() - parallel_thread = Process(target=run_mount_bg, args=(fastafs_binary, ['-p', str(padding), fastafs_tmp_name, 'tmp/mnt'], return_dict)) + cmd = [fastafs_binary, "mount", '-p', str(padding), '-f', fastafs_tmp_filename, mountpoint] + #print(' '.join(cmd)) + parallel_thread = Process(target=run_mount_bg, args=(cmd[0], cmd[2:], return_dict)) parallel_thread.start() - time.sleep(0.1) # there always is some time before invoking the mount and having the mount point up and running - with open("tmp/mnt/" + fastafs_tmp_name + ".fa") as fh_mnt, open(fasta_file) as fh_orig: - for line1, line2 in zip(fh_mnt, fh_orig): - if line1.strip() != line2.strip(): - print("[" + line1.strip() + "] == [" + line2.strip() + "]") - output['diff'] = True - #return output - break + #print(" --- checkpoint 03 --- ") + basename = fastafs_tmp_filename.replace('.fastafs.zstd','').replace('.fastafs.zst','').replace('.fastafs.zstd','').split('/')[-1] + + fn = mountpoint + basename + ".fastafs.fa" + i = 0 + while not os.path.exists(fn): + print("waiting for file [" + fn + "] to come online") + time.sleep(0.1) # there always is some time before invoking the mount and having the mount point up and running + if i > 15: + sys.exit(1) + else: + i += 1 + + matches = 0 + try: + #os.system('ls -als tmp/mnt/*.fa') + #time.sleep(0.1) + #os.system('ls -als tmp/mnt/*.fa') + + with open(fn) as fh_mnt, open(fasta_file) as fh_orig: + for line1, line2 in 
tqdm(zip(fh_mnt, fh_orig)): + if line1.strip() != line2.strip(): + #print("[" + line1.strip() + "] == [" + line2.strip() + "]") + output['diff'] = True + break + else: + matches += 1 + + except Exception as e: + # print('-> ',fn) + # print(':: ',os.path.exists(fn)) + # print('-> ',fasta_file) + # print(':: ',os.path.exists(fasta_file)) + # print("***********************************") + # print("xxxxx ",str(e)) + output['diff'] = True + # 2. check integrity: - cmd = ['fusermount', '-u', "tmp/mnt"] + cmd = ['fusermount', '-u', mountpoint] p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout = subprocess.PIPE) + #print("pt 1: ", parallel_thread.is_alive() ) stdout, stderr = p.communicate() + #print("pt 2: ", parallel_thread.is_alive() ) + while p.poll() is None: + print("waiting for p (umount)") + #print("p: ", p.is_alive() ) + + #print(" --- checkpoint 06 --- ") + + #print("pt 3: ", parallel_thread.is_alive() ) + #while parallel_thread.is_alive() is None: + # print("waiting for pt (thread)") + # time.sleep(0.1) + parallel_thread.join() + #print("pt 4: ", parallel_thread.is_alive() ) + + return_dict['diff'] = output['diff'] + + #print(" --- checkpoint 07 --- ") - return output + #output['perf'] = return_dict['perf'] + + + return return_dict @@ -123,13 +225,17 @@ def diff_fasta_with_view(fasta_file, fastafs_tmp_name, padding, fastafs_binary, 'stdout': {}, 'stderr': {}, 'retcode': {}, - 'diff': False + 'diff': False, + 'perf-cycles': None } - + + mountpoint = mountpoint.rstrip('/') + '/' + # 1. 
fasta to FASTAFS: prog = 'cache' #cmd = [fastafs_binary, prog, '-o', fastafs_tmp_file , fasta_file] cmd = [fastafs_binary, prog, fastafs_tmp_name , fasta_file] + print(cmd) p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout = subprocess.PIPE) stdout, stderr = p.communicate() output['cmd'][prog] = cmd diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index ad556668..693f1652 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -11,6 +11,7 @@ #include "fasta_to_fastafs.hpp" #include "fastafs.hpp" +#include "zstd_seekable_utils.hpp" void flush_buffer(char *buffer, size_t n, char fill) @@ -328,10 +329,11 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) std::string full_file = ">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3\nACTG\nACTG\nAAAA\nCCC\n>chr4\nACTG\nNNNN\n>chr5\nNNAC\nTG\n"; //std::string full_file = ">chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG "; + chunked_reader fhc = chunked_reader(fs.filename.c_str()); for(uint32_t offset = 0; offset < 62; ++offset) { std::string substr_file = full_file.substr(offset, 100); - written = fs.view_fasta_chunk(cache_p4, buffer, 100, offset); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, offset, fhc); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); @@ -368,21 +370,22 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) //std::string std_buffer; // test fastafs_seq functions - std::ifstream fh(fastafs_file.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - BOOST_REQUIRE(fh.is_open()); + //std::ifstream fh(fastafs_file.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + chunked_reader fh = 
chunked_reader(fastafs_file.c_str()); + //BOOST_REQUIRE(fh.is_open()); // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 //[>] [c] [h] [r] [3] [.] [1] [\n] [A] [C] [T] [G] [A] [C] [T] [G] [A] [A] [A] [A] [C] [\n] BOOST_CHECK_EQUAL(fs.data[2]->fasta_filesize(100), 22); - written = fs.data[2]->view_fasta_chunk(cache_p100->sequences[2], buffer, 100, 0, &fh); + written = fs.data[2]->view_fasta_chunk(cache_p100->sequences[2], buffer, 100, 0, fh); BOOST_CHECK_EQUAL(written, 22); std::string std_buffer = std::string(buffer, written); BOOST_CHECK_EQUAL(std_buffer.compare(">chr3.1\nACTGACTGAAAAC\n"), 0); flush_buffer(buffer, 100, '?'); - fh.close(); + //fh.close(); delete[] buffer; @@ -428,6 +431,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) flush_buffer(buffer, 2110, '?'); ffs2f_init* cache = fs.init_ffs2f(60, true); + chunked_reader fhc = chunked_reader(fs.filename.c_str()); /* maak alle substrings: [....] @@ -445,7 +449,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { std::string substr_file = std::string(full_file, start_pos, buffer_len); - written = fs.view_fasta_chunk(cache, buffer, buffer_len, start_pos); + written = fs.view_fasta_chunk(cache, buffer, buffer_len, start_pos, fhc); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); @@ -586,10 +590,11 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) std::string full_file = ">IUPAC\nNBKA\nHMDC\nUWGS\nYVTR\nHGWV\nUMTB\nSDN-\n----\n----\n-BGY\nADNH\nSMUT\nRCKW\nVsbh\nvdnr\ntgyc\nmkwu\naAVT\nSDKN\nB---\nUGWM\nHYRC\n";// length = 117 + chunked_reader fhc = 
chunked_reader(fs.filename.c_str()); for(uint32_t offset = 0; offset < 62; ++offset) { std::string substr_file = full_file.substr(offset, 200); - written = fs.view_fasta_chunk(cache_p4, buffer, 200, offset); + written = fs.view_fasta_chunk(cache_p4, buffer, 200, offset, fhc); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); @@ -608,7 +613,6 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) - // This test case tests is specific for the decoding implementation // it can return less bytes than the buffer_size BOOST_AUTO_TEST_CASE(test_chunked_viewing_buffermaxlen) @@ -633,12 +637,47 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_buffermaxlen) uint32_t written = fs.view_fasta_chunk(cache_p0, buffer, 8192, 0); + BOOST_CHECK_EQUAL(written, 6115); + + delete[] buffer; + delete cache_p0; +} + + + +// This test case tests is specific for the decoding implementation +// it can return less bytes than the buffer_size +BOOST_AUTO_TEST_CASE(test_chunked_viewing_buffermaxlen_lim) +{ + BOOST_REQUIRE_EQUAL(READ_BUFFER_SIZE, 4096);// required for this test + + std::string test_name = "test_007"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + + fasta_to_fastafs(fasta_file, fastafs_file, false); + fastafs fs = fastafs(test_name); + fs.load(fastafs_file); + + BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); + + + char *buffer = new char[8192 + 1];// buffer needs to be c buffer because of the fuse layer + flush_buffer(buffer, 8192, '?'); + + ffs2f_init* cache_p0 = fs.init_ffs2f(8192 + 2, true); + + uint32_t written = fs.view_fasta_chunk(cache_p0, buffer, 4096, 0); + BOOST_CHECK_EQUAL(written, 4096); delete[] buffer; delete cache_p0; } + + + // This test case tests is specific for the decoding implementation // it can return less bytes than 
the buffer_size BOOST_AUTO_TEST_CASE(test_chunked_viewing_buffermaxlen2) @@ -678,5 +717,192 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_buffermaxlen2) + +BOOST_AUTO_TEST_CASE(test_chunked_viewing_zstd) +{ + std::string test_name = "test"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + std::string fastafs_file_zstd = "tmp/" + test_name + ".fastafs.zst"; + + fasta_to_fastafs(fasta_file, fastafs_file, false); + ZSTD_seekable_compressFile_orDie((const char*) fastafs_file.c_str(), + (const char*) fastafs_file_zstd.c_str(), + (int) ZSTD_COMPRESSION_QUALIITY, + (unsigned) ZSTD_SEEKABLE_FRAME_SIZE); + remove(fastafs_file.c_str()); + + fastafs fs = fastafs(test_name); + fs.load(fastafs_file_zstd); + BOOST_CHECK_EQUAL(fs.data.size(), 7); + BOOST_CHECK_EQUAL(fs.data[0]->name.compare("chr1"), 0); + BOOST_CHECK_EQUAL(fs.data[1]->name.compare("chr2"), 0); + BOOST_CHECK_EQUAL(fs.data[2]->name.compare("chr3.1"), 0); + BOOST_CHECK_EQUAL(fs.data[3]->name.compare("chr3.2"), 0); + BOOST_CHECK_EQUAL(fs.data[4]->name.compare("chr3.3"), 0); + BOOST_CHECK_EQUAL(fs.data[5]->name.compare("chr4"), 0); + BOOST_CHECK_EQUAL(fs.data[6]->name.compare("chr5"), 0); + + + char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer + std::string std_buffer; + + // init caches + ffs2f_init* cache_p1 = fs.init_ffs2f(1, false); + ffs2f_init* cache_p4 = fs.init_ffs2f(4, false); + ffs2f_init* cache_p5 = fs.init_ffs2f(5, false); + ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + + // padding: 4 + + size_t written = fs.view_fasta_chunk(cache_p4, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + 
BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 999 - longer than longest seq + written = fs.view_fasta_chunk(cache_p999, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTTCCCCAAAAGGGG >chr2 ACTGACTGNNNNACTG >chr3.1 ACTGACTGAAAAC >chr3.2 ACTGACTGAAAACC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTCCCCAAAAGGGG\n>chr2\nACTGACTGNNNNACTG\n>chr3.1\nACTGACTGAAAAC\n>chr3.2\nACTGACTGAAAACC\n>chr3.3\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 5 - see if 2bit works + written = fs.view_fasta_chunk(cache_p5, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTTC CCCAA AAGGG G >chr2 ACTGA CTGNN NNACT G >chr3.1 ACTGA CTGAA AAC >chr3.2 ACTGA CTGAA AACC >chr3.3 ACTGA CTGAA AACCC >chr4 ACTGN NNN >chr5 NNACT G + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTC\nCCCAA\nAAGGG\nG\n>chr2\nACTGA\nCTGNN\nNNACT\nG\n>chr3.1\nACTGA\nCTGAA\nAAC\n>chr3.2\nACTGA\nCTGAA\nAACC"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + 
BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 1 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 1); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //X----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 2 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 2); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("hr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 3 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 3); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A 
C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("r1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 4 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 4); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 5 + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 5); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 6 + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 6); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = 
std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>ch"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 7 + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 7); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 8 + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 8); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 9 + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 9); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN 
ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("T\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3."), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 10 + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 10); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(written, 100); + BOOST_CHECK_EQUAL(std_buffer.compare("\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3"), 0); + flush_buffer(buffer, 100, '?'); + + std::string full_file = ">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3\nACTG\nACTG\nAAAA\nCCC\n>chr4\nACTG\nNNNN\n>chr5\nNNAC\nTG\n"; + //std::string full_file = ">chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG "; + for(uint32_t offset = 0; offset < 62; ++offset) { + std::string substr_file = full_file.substr(offset, 100); + + written = fs.view_fasta_chunk(cache_p4, buffer, 100, offset); + std_buffer = std::string(buffer, substr_file.size()); + + BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << 
offset); + + flush_buffer(buffer, 100, '?'); + } + + delete[] buffer; + + delete cache_p1; + delete cache_p4; + delete cache_p5; + delete cache_p999; +} + + + BOOST_AUTO_TEST_SUITE_END() diff --git a/tmp/.gitignore b/tmp/.gitignore new file mode 100644 index 00000000..3862dc0b --- /dev/null +++ b/tmp/.gitignore @@ -0,0 +1,9 @@ +*.2bit +benchmark +*.fa +*.fasta +*.fastafs +/mnt +*.txt +*.x +*.zst diff --git a/tmp/benchmark/.gitignore b/tmp/benchmark/.gitignore new file mode 100644 index 00000000..2f96538f --- /dev/null +++ b/tmp/benchmark/.gitignore @@ -0,0 +1,2 @@ +*.fastafs +*.zst