Skip to content

Commit

Permalink
squased commit 1.8.0 / Optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
yhoogstrate committed Dec 28, 2020
1 parent 320ae7c commit 27aacf3
Show file tree
Hide file tree
Showing 56 changed files with 6,689 additions and 758 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Testing/
cmake_install.cmake
.directory
/*.2bit
tmp/
repeats.txt
build/
xcheck.sh
Expand Down
Empty file added .gitmodules
Empty file.
45 changes: 41 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ cmake_minimum_required(VERSION 2.8)

project(fastafs)

# Do this once in a while - find different bugs
# Do this once in a while - find different compiler warnings
#set(CMAKE_CXX_COMPILER "clang++")

set(PROJECT_VERSION "1.7.5")
set(PROJECT_VERSION "1.8.0")
set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs")
set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues")

Expand Down Expand Up @@ -77,10 +77,20 @@ add_custom_target(tidy DEPENDS make_tidy )
# ----------------------------------------------------------------------
# ---------------------------- Compilation -----------------------------


add_subdirectory(src)
include_directories(include)
#include_directories(${BUILD_DIR})
include_directories("${BUILD_DIR}/include")
include_directories("${CMAKE_SOURCE_DIR}/include")


# ZSTD
include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd-lib-common")

# ZSTD-SEEKABLE
add_subdirectory("${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted")
include_directories("${CMAKE_SOURCE_DIR}/dependencies/zstd-seekable-adapted")

add_definitions(-std=c++14)

# Boost
Expand All @@ -98,8 +108,10 @@ link_libraries(ssl)
link_libraries(crypto)
link_libraries(fuse)
link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity
link_libraries(zstd)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")


# -DXXH_NAMESPACE=ZST_
if(DEBUG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -Wall -g -ggdb -Wconversion -D_FILE_OFFSET_BITS=64")# -Werror makes compilation crash when warnings are given (also part of Travis)
else()
Expand All @@ -115,11 +127,19 @@ add_executable(fastafs
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/fivebit_fivebytes.cpp
src/database.cpp
src/utils.cpp
src/sequence_region.cpp
src/fuse.cpp
src/lsfastafs.cpp
src/chunked_reader.cpp


dependencies/zstd-lib-common/xxhash.c
dependencies/zstd-seekable-adapted/zstdseek_utils.cpp
dependencies/zstd-seekable-adapted/zstdseek_compress.cpp
dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp
)
set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}")

Expand All @@ -133,11 +153,19 @@ add_executable(mount.fastafs
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/fivebit_fivebytes.cpp
src/database.cpp
src/utils.cpp
src/sequence_region.cpp
src/fuse.cpp
src/lsfastafs.cpp
src/chunked_reader.cpp


dependencies/zstd-lib-common/xxhash.c
dependencies/zstd-seekable-adapted/zstdseek_utils.cpp
dependencies/zstd-seekable-adapted/zstdseek_compress.cpp
dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp
)
set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}")

Expand All @@ -149,11 +177,19 @@ add_library(libfastafs SHARED
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/fivebit_fivebytes.cpp
src/database.cpp
src/utils.cpp
src/sequence_region.cpp
src/fuse.cpp
src/lsfastafs.cpp
src/chunked_reader.cpp


dependencies/zstd-lib-common/xxhash.c
dependencies/zstd-seekable-adapted/zstdseek_utils.cpp
dependencies/zstd-seekable-adapted/zstdseek_compress.cpp
dependencies/zstd-seekable-adapted/zstdseek_decompress.cpp
)
target_include_directories(libfastafs PUBLIC include)
target_sources(libfastafs PUBLIC include/fastafs.hpp)
Expand All @@ -180,6 +216,7 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as al

add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N)
add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-)
add_test(test_fivebit_fivebytes "${BUILD_TEST_DIR}/test_fivebit_fivebytes")
add_test(test_cache "${BUILD_TEST_DIR}/test_cache")
add_test(test_view "${BUILD_TEST_DIR}/test_view")
add_test(test_flags "${BUILD_TEST_DIR}/test_flags")
Expand Down
8 changes: 8 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
2020-04-29 Youri Hoogstrate

* v1.8.0
* Support for zstd-seekable
* Support for protein sequences using 8-to-5 byte compression
* Object type for chunked/buffered file reading
* Requires libzstd (1.4.5 and above) as dependency

2020-03-03 Youri Hoogstrate

* v1.7.5
Expand Down
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,21 @@ Direct link to the file format specification:

![](https://bioinf-galaxian.erasmusmc.nl/public/images/fastafs/fastafs-example.gif)

## in a compressed and random access manner
## Elegant integration of sequence data archives, backwards compatible with FASTA and no API's needed

RNA, DNA and protein sequences are commonly stored in the FASTA format. Although very commonly used and easy to read, FASTA files consume vast amounts of diskspace and need to be provided with additional files to achieve random access and interoperability. Classical compressors only offer back and forwards compression of the files, often requiring to decompress to a new copy of the FASTA file.
RNA, DNA and protein sequences are commonly stored in the FASTA format. Although very commonly used and easy to read, FASTA files come with additional metadata files and consume unnecessary disk space. These additional metadata files are necessary to achieve random access and certain interoperability features, and require additional maintenance. Classical FASTA (de-)compressors only offer back and forwards compression of the files, often requiring decompression to a new copy of the FASTA file, which makes them impractical solutions, in particular for random access use cases. Although they typically produce very compact archives with quick algorithms, they are not widely adopted in our bioinformatics software.

Here we propose a solution; a virtual layer to (random access) TwoBit/FourBit compression that provides read-only access to a FASTA file and the guarenteed in-sync FAI, DICT and 2BIT files, through a FUSE file system layer. By simply mounting the compressed archive as a FASTA and necessary metadata files, we only virtualize chunks of the FASTA corresponding to an file request. Additional advantages of FASTAFS are the toolkit and interface are sequence verification, checking file integrity and a feature rich toolskit that allows management of the mounted files.
Here we propose a solution; a virtual layer between (random access) FASTA archives and read-only access to FASTA files and their guaranteed in-sync FAI, DICT and 2BIT files, through the File System in Userspace (FUSE) file system layer. When the archive is mounted, fastafs virtualizes a folder containing the FASTA and necessary metadata files, only accessing the chunks of the archive needed to serve the file request. This elegant software solution offers several advantages:
- virtual files and their system calls are identical to flat files and preserve backwards compatibility with tools only compatible with FASTA, also for random access use-cases,
- there is no need to use additional disk space for temporary decompression or to put entire FASTA files into memory,
- for random access requests, computational resources are only spent on decompressing the region of interest,
- it does not need multiple implementations of software libraries for each distinct tool and for each programming language,
- it does not require maintaining multiple files that together make up one data entity, as it is guaranteed to provide dict- and fai-files that are in sync with their FASTA of origin.

FASTAFS is deliberately made backwards compatible with both TwoBit and Fasta. The package even allows to mount TwoBit files instead of FASTAFS files, to FASTA files. An important question is whether FASTAFS is this famous 15th standard (<https://xkcd.com/927/>)?
Partially, but it is not designed to replace FASTA nor 2bit as the mountpoints provide an exact identical way of file access as regular flat file acces, and is thus backwards compatible.
In addition, the corresponding toolkit offers an interface that allows ENA sequence identification, file integrity verification and management of the mounted files and process ids.

FASTAFS is deliberately made backwards compatible with both TwoBit and FASTA. The package even allows mounting TwoBit files, instead of FASTAFS files, as FASTA files. For those who wonder whether FASTAFS is this famous 15th standard (<https://xkcd.com/927/>):
Partially, but it is not designed to replace FASTA nor TwoBit, as the mountpoints provide exactly the same way of file access as regular flat file access, and it is thus backwards compatible. Instead, it offers the same old standard with an elegant toolkit that allows easier integration with workflow management systems.

## Installation and compilation

Expand Down
Empty file added benchmarks/.gitignore
Empty file.
4 changes: 4 additions & 0 deletions benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
timestamp git-commit perf:cycles perf:total_time perf:user_time perf:sys_time cmd git-mod-status
2020-12-28 11:49:37.552478 a8f0d23cf01e6c622932427222862e481b20141c 211512123619 47.812391901 77.174097 0.834557 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps
2020-12-28 11:52:00.792104 a8f0d23cf01e6c622932427222862e481b20141c 212350209391 45.673330693 75.748635 0.850329 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps
2020-12-28 11:53:58.344964 a8f0d23cf01e6c622932427222862e481b20141c 215102039446 47.109749872 76.433183 0.887304 perf stat -e cycles ./bin/fastafs mount -d -f -p 40 -f tmp/benchmark/test.zst tmp/benchmark/mnt/ ##_optimizations...origin/optimizations_[ahead_1]|??_benchmarks/youri-ccbc_aeea40dfc0a64ab0b6234b379bf1f84a.txt|??_deps
1 change: 1 addition & 0 deletions dependencies/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
IntervalTree.hpp
/zstd
30 changes: 30 additions & 0 deletions dependencies/zstd-lib-common/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
BSD License

For Zstandard software

Copyright (c) 2016-present, Facebook, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1 change: 1 addition & 0 deletions dependencies/zstd-lib-common/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
These files are copied from ZSTD 1.4.5 (lib/common)
Loading

0 comments on commit 27aacf3

Please sign in to comment.